Merge branch 'main' into tpopp/fix_aiter_triton_rope

This commit is contained in:
Cyrus Leung 2025-12-24 13:36:53 +08:00 committed by GitHub
commit de36a71f03
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
145 changed files with 3763 additions and 1846 deletions

View File

@ -162,7 +162,10 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2) - label: Entrypoints Integration Test (API Server 2)
@ -349,7 +352,9 @@ steps:
- label: V1 Test e2e + engine # 65min - label: V1 Test e2e + engine # 65min
timeout_in_minutes: 90 timeout_in_minutes: 90
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_4 # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability.
# See discussion here: https://github.com/vllm-project/vllm/pull/31040
agent_pool: mi325_8
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
@ -977,7 +982,10 @@ steps:
- export MIOPEN_DEBUG_CONV_GEMM=0 - export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
# Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 5min - label: Multi-Modal Accuracy Eval (Small Models) # 5min
@ -1339,7 +1347,9 @@ steps:
# end platform plugin tests # end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin - pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
# TODO: Remove after next torch update
- VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y - pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test # end io_processor plugins test
# begin stat_logger plugins test # begin stat_logger plugins test

View File

@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant(
random.seed(seed) random.seed(seed)
# Set environment variables # Set environment variables
os.environ["VLLM_ATTENTION_BACKEND"] = backend
if batch_invariant: if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1" os.environ["VLLM_BATCH_INVARIANT"] = "1"
else: else:
@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant(
max_model_len=max_model_len, max_model_len=max_model_len,
dtype="bfloat16", dtype="bfloat16",
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
attention_config={"backend": backend},
enable_prefix_caching=False, enable_prefix_caching=False,
) )
init_time = time.perf_counter() - start_init init_time = time.perf_counter() - start_init

View File

@ -9,16 +9,6 @@
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping); const torch::Tensor& block_mapping);
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping);
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping, torch::Tensor& slot_mapping,

View File

@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
} // namespace vllm } // namespace vllm
// Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping) {
int num_layers = key_caches.size();
TORCH_CHECK(num_layers == value_caches.size());
if (num_layers == 0) {
return;
}
torch::Device cache_device = key_caches[0].device();
TORCH_CHECK(cache_device.is_cuda());
// Create data structures for the kernel.
// Create an array of pointers to the key and value caches.
int64_t key_cache_ptrs[num_layers];
int64_t value_cache_ptrs[num_layers];
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
key_cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
value_cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
}
// block_mapping is a 2D tensor with shape (num_pairs, 2).
int num_pairs = block_mapping.size(0);
// Move the data structures to the GPU.
// NOTE: This synchronizes the CPU and GPU.
torch::Tensor key_cache_ptrs_tensor =
torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
.to(cache_device);
torch::Tensor value_cache_ptrs_tensor =
torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
.to(cache_device);
// Launch the kernel.
const int numel_per_block = key_caches[0][0].numel();
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, numel_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
key_cache_ptrs_tensor.data_ptr<int64_t>(),
value_cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), numel_per_block);
}));
}
// copy blocks kernel for MLA (assumes a joint KV-cache)
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping) {
int num_layers = kv_caches.size();
if (num_layers == 0) {
return;
}
torch::Device cache_device = kv_caches[0].device();
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
std::vector<int64_t> cache_ptrs(num_layers);
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
}
torch::Tensor cache_ptrs_tensor =
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
.to(cache_device);
int num_pairs = block_mapping.size(0);
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
int mem_footprint_per_block = kv_caches[0].stride(0);
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, mem_footprint_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
}));
}
namespace vllm { namespace vllm {
// Used to copy/convert one element // Used to copy/convert one element
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
for (int i = 0; i < VEC_SIZE; i++) { for (int i = 0; i < VEC_SIZE; i++) {
amax = fmaxf(amax, fabsf(float(k_val_ptr[i]))); amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
} }
#ifndef USE_ROCM
__syncwarp();
#endif
// Reduced amax // Reduced amax
for (int mask = 16; mask > 0; mask /= 2) { for (int mask = 16; mask > 0; mask /= 2) {
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask)); amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
#endif #endif
} }
#ifndef USE_ROCM
__syncwarp();
#endif
#if defined(__gfx942__) #if defined(__gfx942__)
float scale = fmaxf(amax, 1e-4) / 224.0f; float scale = fmaxf(amax, 1e-4) / 224.0f;
#else #else

View File

@ -405,7 +405,6 @@ void fused_qk_norm_rope(
qkv.scalar_type() == k_weight.scalar_type(), qkv.scalar_type() == k_weight.scalar_type(),
"qkv, q_weight and k_weight must have the same dtype"); "qkv, q_weight and k_weight must have the same dtype");
int64_t rotary_dim = cos_sin_cache.size(1);
int64_t num_tokens = qkv.size(0); int64_t num_tokens = qkv.size(0);
TORCH_CHECK(position_ids.size(0) == num_tokens, TORCH_CHECK(position_ids.size(0) == num_tokens,
"Number of tokens in position_ids must match QKV"); "Number of tokens in position_ids must match QKV");

View File

@ -35,7 +35,7 @@ template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) { __host__ __device__ inline Int round_up(Int x, Int y) {
static_assert(std::is_integral_v<Int>, static_assert(std::is_integral_v<Int>,
"round_up argument must be integral type"); "round_up argument must be integral type");
return (x + y - 1) / y * y; return ((x + y - 1) / y) * y;
} }
// Compute effective rows for grid configuration with swizzled SF layouts. // Compute effective rows for grid configuration with swizzled SF layouts.
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
int sf_m = round_up<int>(numRows, 128); int sf_m = round_up<int>(numRows, 128);
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4; int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) { int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
// Each thread writes 4 uint32_t elements.
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
col += blockDim.x * 4) {
SFout[row * sf_n_int + col] = 0x00;
}
}
// Get the global scaling factor, which will be applied to the SF. // Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is // Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)). // (448.f / (Alpha_A / 6.f)).
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0]; float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
// Input tensor row/col loops. // Iterate over all rows and cols including padded ones -
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { // ensures we visit every single scale factor address to initialize it.
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
for (int colIdx = threadIdx.x;
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
colIdx += blockDim.x) { colIdx += blockDim.x) {
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
PackedVec in_vec;
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
// Get the output tensor offset. // If we are outside valid rows OR outside valid columns -> Use Zeros
// Same as inOffset because 8 elements are packed into one uint32_t. if (rowIdx >= numRows || elem_idx >= numCols) {
int64_t outOffset = inOffset; memset(&in_vec, 0, sizeof(PackedVec));
auto& out_pos = out[outOffset];
} else {
// Valid Region: Load actual data
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
}
auto sf_out = auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t, cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>( CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numKTiles, SFout); rowIdx, colIdx, numKTiles, SFout);
out_pos = auto out_val =
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out); cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
// We do NOT write output for padding because the 'out' tensor is not
// padded.
if (rowIdx < numRows && elem_idx < numCols) {
// Same as inOffset because 8 elements are packed into one uint32_t.
out[inOffset] = out_val;
}
} }
} }
} }
@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr), m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
reinterpret_cast<uint32_t*>(sf_out)); reinterpret_cast<uint32_t*>(sf_out));
}); });
} }

View File

@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()"); "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks); cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
// Copy the cache blocks from src to dst.
cache_ops.def(
"copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
cache_ops.def(
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
// Reshape the key and value tensors and cache them. // Reshape the key and value tensors and cache them.
cache_ops.def( cache_ops.def(
"reshape_and_cache(Tensor key, Tensor value," "reshape_and_cache(Tensor key, Tensor value,"

View File

@ -183,7 +183,7 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_REGION_NAME=us-west-2
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$USE_SCCACHE" = "1" ]; then \ if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \ echo "Installing sccache..." \
&& case "${TARGETPLATFORM}" in \
linux/arm64) SCCACHE_ARCH="aarch64" ;; \
linux/amd64) SCCACHE_ARCH="x86_64" ;; \
*) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac \
&& export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \ && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
&& tar -xzf sccache.tar.gz \ && tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
&& if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \ && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \

View File

@ -1,5 +1,5 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="a272dfa8" ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_BRANCH="89075173"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt

View File

@ -2,7 +2,7 @@ FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
add-apt-repository -y ppa:kobuk-team/intel-graphics add-apt-repository -y ppa:kobuk-team/intel-graphics-staging
RUN apt clean && apt-get update -y && \ RUN apt clean && apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \ apt-get install -y --no-install-recommends --fix-missing \
@ -47,6 +47,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \ pip install --no-cache-dir \
-r requirements/xpu.txt -r requirements/xpu.txt
# arctic-inference is built from source which needs torch-xpu properly installed
# used for suffix method speculative decoding
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir arctic-inference==0.1.1
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
COPY . . COPY . .

View File

@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe. You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).

View File

@ -0,0 +1,5 @@
# llm-d
vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).

View File

@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
- [Helm](frameworks/helm.md) - [Helm](frameworks/helm.md)
- [InftyAI/llmaz](integrations/llmaz.md) - [InftyAI/llmaz](integrations/llmaz.md)
- [llm-d](integrations/llm-d.md)
- [KAITO](integrations/kaito.md) - [KAITO](integrations/kaito.md)
- [KServe](integrations/kserve.md) - [KServe](integrations/kserve.md)
- [Kthena](integrations/kthena.md) - [Kthena](integrations/kthena.md)

View File

@ -64,7 +64,7 @@ th:not(:first-child) {
| [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [CP](../configuration/optimization.md#chunked-prefill) | [](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [APC](automatic_prefix_caching.md) | [](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | | [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [](https://github.com/vllm-project/vllm/issues/26970) |
| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ |

View File

@ -28,3 +28,4 @@ The backends below live **outside** the main `vllm` repository and follow the
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> | | Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> | | Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |
| Sophgo TPU | N/A, install from source | <https://github.com/sophgo/vllm-tpu> | | Sophgo TPU | N/A, install from source | <https://github.com/sophgo/vllm-tpu> |
| Apple Silicon (Metal) | N/A, install from source | <https://github.com/vllm-project/vllm-metal> |

View File

@ -4,6 +4,9 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
!!! tip "GPU-Accelerated Inference with vLLM-Metal"
For GPU-accelerated inference on Apple Silicon using Metal, check out [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend.
# --8<-- [end:installation] # --8<-- [end:installation]
# --8<-- [start:requirements] # --8<-- [start:requirements]

View File

@ -490,6 +490,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | | `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | |
| `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | | `ModernBertModel`<sup>C</sup> | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | |
| `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | | `NomicBertModel`<sup>C</sup> | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | |
| `LlamaBidirectionalModel`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-embed-1b-v2`, etc. | ✅︎ | ✅︎ |
| `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | | `LlamaModel`<sup>C</sup>, `LlamaForCausalLM`<sup>C</sup>, `MistralModel`<sup>C</sup>, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ |
| `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | | `Qwen2Model`<sup>C</sup>, `Qwen2ForCausalLM`<sup>C</sup> | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | | `Qwen3Model`<sup>C</sup>, `Qwen3ForCausalLM`<sup>C</sup> | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ |
@ -543,8 +544,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | |
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ |
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | | `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | | `LlamaBidirectionalForSequenceClassification`<sup>C</sup> | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | | `Qwen2ForSequenceClassification`<sup>C</sup> | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ |
| `Qwen3ForSequenceClassification`<sup>C</sup> | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | | `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | |
| `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* | | `*Model`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | N/A | \* | \* |
@ -562,6 +564,11 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
!!! note !!! note
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture.
!!! note
`nvidia/llama-nemotron-rerank-1b-v2` require a specific prompt format to work correctly.
Examples : [offline_using_template.py](../../examples/pooling/score/offline_using_template.py) [online_using_template.py](../../examples/pooling/score/online_using_template.py)
!!! note !!! note
Load the official original `mxbai-rerank-v2` by using the following command. Load the official original `mxbai-rerank-v2` by using the following command.

View File

@ -16,7 +16,7 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
from langchain_community.llms import VLLM from langchain_community.llms import VLLM
llm = VLLM( llm = VLLM(
model="mosaicml/mpt-7b", model="Qwen/Qwen3-4B",
trust_remote_code=True, # mandatory for hf models trust_remote_code=True, # mandatory for hf models
max_new_tokens=128, max_new_tokens=128,
top_k=10, top_k=10,

View File

@ -669,6 +669,21 @@ You can find the documentation for cross encoder models at [sbert.net](https://w
Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py) Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py)
#### Score Template
Some scoring models require a specific prompt format to work correctly. You can specify a custom score template using the `--chat-template` parameter (see [Chat Template](#chat-template)).
Score templates are supported for **cross-encoder** models only. If you are using an **embedding** model for scoring, vLLM does not apply a score template.
Like chat templates, the score template receives a `messages` list. For scoring, each message has a `role` attribute—either `"query"` or `"document"`. For the usual kind of point-wise cross-encoder, you can expect exactly two messages: one query and one document. To access the query and document content, use Jinja's `selectattr` filter:
- **Query**: `{{ (messages | selectattr("role", "eq", "query") | first).content }}`
- **Document**: `{{ (messages | selectattr("role", "eq", "document") | first).content }}`
This approach is more robust than index-based access (`messages[0]`, `messages[1]`) because it selects messages by their semantic role. It also avoids assumptions about message ordering if additional message types are added to `messages` in the future.
Example template file: [examples/pooling/score/template/nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja)
#### Single inference #### Single inference
You can pass a string to both `text_1` and `text_2`, forming a single sentence pair. You can pass a string to both `text_1` and `text_2`, forming a single sentence pair.

View File

@ -0,0 +1,27 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from pathlib import Path
from vllm import LLM
model_name = "nvidia/llama-nemotron-rerank-1b-v2"
# Path to template file
template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja"
chat_template = template_path.read_text()
llm = LLM(model=model_name, runner="pooling", trust_remote_code=True)
query = "how much protein should a female eat?"
documents = [
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
"Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]
outputs = llm.score(query, documents, chat_template=chat_template)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)

View File

@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example of using the rerank API with template.
run:
vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
"""
import json
import requests
url = "http://127.0.0.1:8000/rerank"
headers = {"accept": "application/json", "Content-Type": "application/json"}
query = "how much protein should a female eat?"
documents = [
"As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
"Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]
data = {
"model": "nvidia/llama-nemotron-rerank-1b-v2",
"query": query,
"documents": documents,
}
def main():
response = requests.post(url, headers=headers, json=data)
# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,3 @@
question:{{ (messages | selectattr("role", "eq", "query") | first).content }}
passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}

View File

@ -557,7 +557,8 @@ def test_rms_group_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used. # To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general. # Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
# Testing properties # Testing properties

View File

@ -77,6 +77,7 @@ def test_dynamic_shapes_compilation(
"evaluate_guards": evaluate_guards, "evaluate_guards": evaluate_guards,
}, },
}, },
max_model_len=1024,
) )
output = model.generate(prompt) output = model.generate(prompt)

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import pytest import pytest
import torch import torch
@ -53,37 +52,61 @@ class TestModel(torch.nn.Module):
hidden_size: int, hidden_size: int,
eps: float, eps: float,
group_shape: GroupShape, group_shape: GroupShape,
cuda_force_torch: bool, use_aiter: bool = False,
cuda_force_torch: bool = False,
use_aiter_quant_op: bool = True,
*args, *args,
**kwargs, **kwargs,
): ):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.use_aiter = use_aiter
self.use_aiter_quant_op = use_aiter_quant_op
self.cuda_force_torch = cuda_force_torch self.cuda_force_torch = cuda_force_torch
self.group_shape = group_shape
self.enable_quant_fp8_custom_op = None # Will be set later if applicable
self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)] self.norm = [RMSNorm(hidden_size, eps) for _ in range(4)]
if group_shape.is_per_group():
self.wscale = [ # Setup quantization scale descriptor
torch.rand( static = group_shape == GroupShape.PER_TENSOR and not use_aiter
(hidden_size // group_shape[1], hidden_size // group_shape[1]),
dtype=torch.float32,
)
for _ in range(3)
]
else:
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
static = group_shape == GroupShape.PER_TENSOR
quant_scale = ScaleDesc(torch.float32, static, group_shape) quant_scale = ScaleDesc(torch.float32, static, group_shape)
self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True) self.quant_key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
# Setup scales
if static: if static:
self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)] self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(3)]
else: else:
self.scale = [None for _ in range(3)] self.scale = [None for _ in range(3)]
# Setup weights
self.w = [ self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3) torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE) for _ in range(3)
] ]
if not group_shape.is_per_group(): if not group_shape.is_per_group() or use_aiter:
self.w = [self.w[0].t() for _ in range(3)] self.w = [self.w[0].t() for _ in range(3)]
# Setup weight scales
if group_shape.is_per_group(): if group_shape.is_per_group():
scale_size = (
(hidden_size + 128 - 1) // 128
if use_aiter
else hidden_size // group_shape[1]
)
wscale_shape: tuple[int, ...] = (scale_size, scale_size)
else:
wscale_shape = (1,)
self.wscale = [torch.rand(wscale_shape, dtype=torch.float32) for _ in range(3)]
# Setup FP8 linear operation
is_per_group = group_shape.is_per_group()
if is_per_group and use_aiter:
self.fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(128, 128),
act_quant_group_shape=group_shape,
use_aiter_and_is_supported=use_aiter_quant_op,
)
# AITER blockwise doesn't use enable_quant_fp8_custom_op
elif is_per_group:
self.fp8_linear = W8A8BlockFp8LinearOp( self.fp8_linear = W8A8BlockFp8LinearOp(
weight_group_shape=GroupShape(group_shape[1], group_shape[1]), weight_group_shape=GroupShape(group_shape[1], group_shape[1]),
act_quant_group_shape=group_shape, act_quant_group_shape=group_shape,
@ -91,6 +114,13 @@ class TestModel(torch.nn.Module):
use_aiter_and_is_supported=False, use_aiter_and_is_supported=False,
) )
self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled() self.enable_quant_fp8_custom_op = self.fp8_linear.input_quant_op.enabled()
elif use_aiter:
self.fp8_linear = Fp8LinearOp(
act_quant_static=False,
act_quant_group_shape=group_shape,
)
self.fp8_linear.quant_fp8.use_aiter = use_aiter_quant_op
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
else: else:
with override_cutlass_fp8_supported(not cuda_force_torch): with override_cutlass_fp8_supported(not cuda_force_torch):
self.fp8_linear = Fp8LinearOp( self.fp8_linear = Fp8LinearOp(
@ -100,7 +130,6 @@ class TestModel(torch.nn.Module):
self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled() self.enable_quant_fp8_custom_op = self.fp8_linear.quant_fp8.enabled()
self.enable_rms_norm_custom_op = self.norm[0].enabled() self.enable_rms_norm_custom_op = self.norm[0].enabled()
self.group_shape = group_shape
def forward(self, x): def forward(self, x):
# avoid having graph input be an arg to a pattern directly # avoid having graph input be an arg to a pattern directly
@ -126,19 +155,49 @@ class TestModel(torch.nn.Module):
y4, resid = self.norm[3](x4, resid) # use resid here y4, resid = self.norm[3](x4, resid) # use resid here
return y4 return y4
def ops_in_model_before(self):
if (
self.use_aiter
and self.group_shape.is_per_group()
and current_platform.is_fp8_fnuz()
):
return [rocm_aiter_ops.get_group_quant_op()]
if self.use_aiter and self.group_shape.is_per_group():
return [torch.ops.vllm.triton_per_token_group_quant_fp8.default]
if self.use_aiter and self.use_aiter_quant_op:
return [rocm_aiter_ops.get_per_token_quant_op()]
if self.use_aiter:
return [QUANT_OPS[self.quant_key]]
if self.enable_quant_fp8_custom_op:
return [QUANT_OPS[self.quant_key]]
return [torch.ops.aten.reciprocal]
def ops_in_model_after(self): def ops_in_model_after(self):
if self.use_aiter and self.group_shape.is_per_group():
from vllm.compilation.rocm_aiter_fusion import (
AiterFusedAddRMSFp8GroupQuantPattern,
AiterRMSFp8GroupQuantPattern,
)
return [
AiterFusedAddRMSFp8GroupQuantPattern.FUSED_OP,
AiterRMSFp8GroupQuantPattern.FUSED_OP,
]
if self.use_aiter:
from vllm.compilation.rocm_aiter_fusion import (
AiterFusedAddRMSNormDynamicQuantPattern,
AiterRMSNormDynamicQuantPattern,
)
return [
AiterFusedAddRMSNormDynamicQuantPattern.FUSED_OP,
AiterRMSNormDynamicQuantPattern.FUSED_OP,
]
return [ return [
FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)], FUSED_OPS[FusedRMSQuantKey(self.quant_key, True)],
FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)], FUSED_OPS[FusedRMSQuantKey(self.quant_key, False)],
] ]
def ops_in_model_before(self):
return (
[QUANT_OPS[self.quant_key]]
if self.enable_quant_fp8_custom_op
else [torch.ops.aten.reciprocal]
)
def ops_in_model_before_partial(self): def ops_in_model_before_partial(self):
return ( return (
[RMS_OP, RMS_ADD_OP] [RMS_OP, RMS_ADD_OP]
@ -155,67 +214,45 @@ GROUP_SHAPES = [
] ]
class TestRmsnormGroupFp8QuantModel(torch.nn.Module): def _run_fusion_test(
def __init__(self, hidden_size: int, eps: float, **kwargs): model,
super().__init__() fusion_pass,
self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( vllm_config,
weight_group_shape=GroupShape(128, 128), dtype,
act_quant_group_shape=GroupShape(1, 128), hidden_size,
cutlass_block_fp8_supported=False, num_tokens,
use_aiter_and_is_supported=True, ):
) """Helper function for common fusion test logic.
self.w = [
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(3)
]
scale_hidden_size = (hidden_size + 128 - 1) // 128 Must be called within vllm_config context.
self.wscale = [ """
torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32) noop_pass = NoOpEliminationPass(vllm_config)
for _ in range(3) cleanup_pass = PostCleanupPass(vllm_config)
]
self.norm_weight = [torch.ones(hidden_size) for _ in range(4)] backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
self.eps = eps backend2 = TestBackend(noop_pass, cleanup_pass)
def forward(self, x): x = torch.rand(num_tokens, hidden_size)
# avoid having graph input be an arg to a pattern directly torch._dynamo.mark_dynamic(x, 0)
x = resid = torch.relu(x)
y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps)
x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0]) model_fused = torch.compile(model, backend=backend)
# make sure resid is used for replacement to work result_fused = model_fused(x)
y2, resid = rocm_aiter_ops.rms_norm2d_with_add(
x2, resid, self.norm_weight[1], self.eps
)
x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1]) model_unfused = torch.compile(model, backend=backend2)
result_unfused = model_unfused(x)
y3, resid = rocm_aiter_ops.rms_norm2d_with_add( if dtype == torch.float16:
x3, resid, self.norm_weight[2], self.eps ATOL, RTOL = (2e-3, 2e-3)
) else:
ATOL, RTOL = (1e-2, 1e-2)
x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2]) torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
y4, resid = rocm_aiter_ops.rms_norm2d_with_add( assert fusion_pass.matched_count == 3
x4, resid, self.norm_weight[3], self.eps backend.check_before_ops(model.ops_in_model_before())
) backend.check_after_ops(model.ops_in_model_after())
return y4
def ops_in_model_before(self): return backend, backend2
return [
torch.ops.vllm.rocm_aiter_rms_norm,
torch.ops.vllm.rocm_aiter_group_fp8_quant,
]
def ops_in_model_before_partial(self):
return []
def ops_in_model_after(self):
return [
torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant,
torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant,
]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@ -223,11 +260,8 @@ class TestRmsnormGroupFp8QuantModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("group_shape", GROUP_SHAPES) @pytest.mark.parametrize("group_shape", GROUP_SHAPES)
@pytest.mark.parametrize( @pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False])
"model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op", @pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False])
list(itertools.product([TestModel], [True, False], [True, False]))
+ [(TestRmsnormGroupFp8QuantModel, False, False)],
)
# cuda_force_torch used to test torch code path on platforms that # cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True. # cutlass_fp8_supported() == True.
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -242,23 +276,13 @@ def test_fusion_rmsnorm_quant(
num_tokens, num_tokens,
eps, eps,
group_shape, group_shape,
model_class,
enable_rms_norm_custom_op, enable_rms_norm_custom_op,
enable_quant_fp8_custom_op, enable_quant_fp8_custom_op,
cuda_force_torch, cuda_force_torch,
): ):
if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND:
pytest.skip("AITER is not supported on this GPU.")
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity() # needed for certain non-cutlass fp8 paths
if not enable_quant_fp8_custom_op and group_shape.is_per_group(): if not enable_quant_fp8_custom_op and group_shape.is_per_group():
pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization") pytest.skip("Unsupported unwrapped quant fp8 op for blockwise quantization")
# Skip test for 64-bit group shape when running with cutlass or deepgemm
if group_shape == GroupShape(1, 64) and ( if group_shape == GroupShape(1, 64) and (
cutlass_block_fp8_supported() or is_deep_gemm_supported() cutlass_block_fp8_supported() or is_deep_gemm_supported()
): ):
@ -269,6 +293,7 @@ def test_fusion_rmsnorm_quant(
custom_ops.append("+rms_norm") custom_ops.append("+rms_norm")
if enable_quant_fp8_custom_op: if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8") custom_ops.append("+quant_fp8")
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype), model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
@ -279,60 +304,97 @@ def test_fusion_rmsnorm_quant(
), ),
), ),
) )
with vllm.config.set_current_vllm_config(vllm_config): with vllm.config.set_current_vllm_config(vllm_config):
# Reshape pass is needed for the fusion pass to work # Setup device before model creation
noop_pass = NoOpEliminationPass(vllm_config) torch.set_default_device("cuda")
if model_class is TestRmsnormGroupFp8QuantModel: torch.set_default_dtype(dtype)
from vllm.compilation.rocm_aiter_fusion import ( torch.manual_seed(1)
RocmAiterRMSNormFp8GroupQuantFusionPass, maybe_create_device_identity()
)
fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config) fusion_pass = RMSNormQuantFusionPass(vllm_config)
else: model = TestModel(
fusion_pass = RMSNormQuantFusionPass(vllm_config)
cleanup_pass = PostCleanupPass(vllm_config)
backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
backend2 = TestBackend(noop_pass, cleanup_pass)
model = model_class(
hidden_size=hidden_size, hidden_size=hidden_size,
eps=eps, eps=eps,
group_shape=group_shape, group_shape=group_shape,
use_aiter=False,
cuda_force_torch=cuda_force_torch, cuda_force_torch=cuda_force_torch,
) )
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
torch._dynamo.mark_dynamic(x, 0)
model_fused = torch.compile(model, backend=backend) backend, _ = _run_fusion_test(
result_fused = model_fused(x) model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
)
model_unfused = torch.compile(model, backend=backend2)
result_unfused = model_unfused(x)
if dtype == torch.float16:
ATOL, RTOL = (2e-3, 2e-3)
else:
ATOL, RTOL = (1e-2, 1e-2)
torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
assert fusion_pass.matched_count == 3
backend.check_before_ops(model.ops_in_model_before())
backend.check_before_ops( backend.check_before_ops(
model.ops_in_model_before_partial(), fully_replaced=False model.ops_in_model_before_partial(), fully_replaced=False
) )
backend.check_after_ops(model.ops_in_model_after())
# If RMSNorm custom op is disabled (native/torch impl used), # If RMSNorm custom op is disabled (native/torch impl used),
# there's a risk that the fused add doesn't get included in the # there's a risk that the fused add doesn't get included in the
# replacement and only the rms part gets fused with quant. # replacement and only the rms part gets fused with quant.
# Hence, we check only 2 add nodes are left (final fused rmsnorm add). # Hence, we check only 2 add nodes are left (final fused rmsnorm add).
if ( if not enable_rms_norm_custom_op:
not enable_rms_norm_custom_op
and model_class is not TestRmsnormGroupFp8QuantModel
):
n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g)) n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g))
# 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each) # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each)
assert n_add_nodes(backend.graph_pre_pass) == 7 assert n_add_nodes(backend.graph_pre_pass) == 7
assert n_add_nodes(backend.graph_post_pass) == 2 assert n_add_nodes(backend.graph_post_pass) == 2
GROUP_SHAPE_QUANT_OPS_MATCHS = [
(GroupShape.PER_TOKEN, True),
(GroupShape.PER_TOKEN, False),
(GroupShape(1, 128), True),
]
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("num_tokens", [257])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize(
"group_shape, use_aiter_quant_op", GROUP_SHAPE_QUANT_OPS_MATCHS
)
@pytest.mark.skipif(
(not current_platform.is_rocm() or not IS_AITER_FOUND),
reason="Only test on ROCm with aiter package installed",
)
def test_aiter_fusion_rmsnorm_quant(
dtype: torch.dtype,
hidden_size: int,
num_tokens: int,
eps: float,
group_shape: GroupShape,
use_aiter_quant_op: bool,
monkeypatch: pytest.MonkeyPatch,
):
vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=["+rms_norm", "+quant_fp8"],
pass_config=PassConfig(fuse_norm_quant=True, eliminate_noops=True),
),
)
with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
from vllm.compilation.rocm_aiter_fusion import RocmAiterRMSNormFusionPass
m.setenv("VLLM_ROCM_USE_AITER", "1")
rocm_aiter_ops.refresh_env_variables()
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
maybe_create_device_identity()
fusion_pass = RocmAiterRMSNormFusionPass(vllm_config)
model = TestModel(
hidden_size=hidden_size,
eps=eps,
group_shape=group_shape,
use_aiter=True,
use_aiter_quant_op=use_aiter_quant_op,
)
_run_fusion_test(
model, fusion_pass, vllm_config, dtype, hidden_size, num_tokens
)

View File

@ -8,7 +8,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, fetch_audio from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
} }
@pytest.fixture(scope="session")
def url_encoded_audio() -> dict[str, str]:
return {
audio_url: encode_audio_url(*fetch_audio(audio_url))
for audio_url in TEST_AUDIO_URLS
}
def dummy_messages_from_audio_url( def dummy_messages_from_audio_url(
audio_urls: str | list[str], audio_urls: str | list[str],
content_text: str = "What's happening in this audio?", content_text: str = "What's happening in this audio?",
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
audio_url: str, audio_url: str,
base64_encoded_audio: dict[str, str], url_encoded_audio: dict[str, str],
): ):
messages = dummy_messages_from_audio_url( messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
)
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -313,7 +319,7 @@ async def test_chat_streaming_input_audio(
"format": "wav", "format": "wav",
}, },
}, },
{"type": "text", "text": "What's happening in this audio?"}, {"type": "text", "text": "What's a short title for this audio?"},
], ],
} }
] ]

View File

@ -7,7 +7,7 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video from vllm.multimodal.utils import encode_video_url, fetch_video
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@ -48,9 +48,9 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]: def url_encoded_video() -> dict[str, str]:
return { return {
video_url: encode_video_base64(fetch_video(video_url)[0]) video_url: encode_video_url(fetch_video(video_url)[0])
for video_url in TEST_VIDEO_URLS for video_url in TEST_VIDEO_URLS
} }
@ -175,11 +175,9 @@ async def test_single_chat_session_video_base64encoded(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
video_url: str, video_url: str,
base64_encoded_video: dict[str, str], url_encoded_video: dict[str, str],
): ):
messages = dummy_messages_from_video_url( messages = dummy_messages_from_video_url(url_encoded_video[video_url])
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
)
# test single completion # test single completion
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
@ -223,11 +221,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
video_url: str, video_url: str,
base64_encoded_video: dict[str, str], url_encoded_video: dict[str, str],
): ):
messages = dummy_messages_from_video_url( messages = dummy_messages_from_video_url(url_encoded_video[video_url])
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
)
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,

View File

@ -9,7 +9,7 @@ import pytest_asyncio
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import encode_image_url, fetch_image
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@ -35,7 +35,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
], ],
[ [
"The image shows a Venn diagram with three over", "The image shows a Venn diagram with three over",
"The image shows a colorful Venn diagram with", "The image displays a Venn diagram with three over",
], ],
[ [
"This image displays a gradient of colors ranging from", "This image displays a gradient of colors ranging from",
@ -70,11 +70,9 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]: def url_encoded_image(local_asset_server) -> dict[str, str]:
return { return {
image_asset: encode_image_base64( image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
local_asset_server.get_image_asset(image_asset)
)
for image_asset in TEST_IMAGE_ASSETS for image_asset in TEST_IMAGE_ASSETS
} }
@ -234,11 +232,11 @@ async def test_single_chat_session_image_base64encoded(
model_name: str, model_name: str,
raw_image_url: str, raw_image_url: str,
image_url: str, image_url: str,
base64_encoded_image: dict[str, str], url_encoded_image: dict[str, str],
): ):
content_text = "What's in this image?" content_text = "What's in this image?"
messages = dummy_messages_from_image_url( messages = dummy_messages_from_image_url(
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", url_encoded_image[raw_image_url],
content_text, content_text,
) )
@ -288,15 +286,13 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
image_idx: int, image_idx: int,
base64_encoded_image: dict[str, str], url_encoded_image: dict[str, str],
): ):
# NOTE: This test also validates that we pass MM data through beam search # NOTE: This test also validates that we pass MM data through beam search
raw_image_url = TEST_IMAGE_ASSETS[image_idx] raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = dummy_messages_from_image_url( messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
)
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,

View File

@ -10,7 +10,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2 MAXIMUM_IMAGES = 2
@ -48,14 +48,6 @@ def server():
yield remote_server yield remote_server
@pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]:
return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS
}
def get_hf_prompt_tokens(model_name, content, image_url): def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained( processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4 model_name, trust_remote_code=True, num_crops=4

View File

@ -0,0 +1,352 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
from vllm.entrypoints.score_utils import get_score_prompt
from vllm.inputs import TokensPrompt
from vllm.tokenizers import get_tokenizer
# A cross-encoder model for testing
CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
def assert_prompt_tokenization_consistent(
tokenizer, full_prompt, engine_prompt, add_special_tokens=True
):
"""Verify that engine_prompt token_ids match tokenizing full_prompt."""
expected_ids = tokenizer(full_prompt, add_special_tokens=add_special_tokens)[
"input_ids"
]
actual_ids = engine_prompt["prompt_token_ids"]
assert actual_ids == expected_ids, (
f"Token IDs don't match.\nExpected: {expected_ids}\nActual: {actual_ids}"
)
@pytest.fixture(scope="module")
def cross_encoder_model_config():
return ModelConfig(
CROSS_ENCODER_MODEL_ID,
runner="pooling",
)
@pytest.fixture(scope="module")
def cross_encoder_tokenizer(cross_encoder_model_config):
return get_tokenizer(
CROSS_ENCODER_MODEL_ID,
trust_remote_code=cross_encoder_model_config.trust_remote_code,
)
@pytest.fixture(scope="module")
def llm_reranker_model_config():
"""Model config for LLM-as-reranker style (no pad token)."""
config = ModelConfig(
CROSS_ENCODER_MODEL_ID,
runner="pooling",
)
# use_pad_token is a property that reads from hf_config,
# so we set it there to override the default (True)
config.hf_config.use_pad_token = False
return config
@pytest.fixture
def tokenization_kwargs():
"""Common tokenization kwargs used across tests."""
return {"add_special_tokens": True, "return_tensors": None}
@pytest.fixture
def mock_model_with_score_template():
"""Mock model class that supports score template and tracks post_process calls."""
class MockModelWithScoreTemplate:
supports_score_template = True
post_process_called: list[TokensPrompt] = []
@staticmethod
def get_score_template(p1: str, p2: str) -> str:
return f"[QUERY]{p1}[SEP][DOC]{p2}"
@staticmethod
def post_process_tokens(prompt: TokensPrompt) -> None:
MockModelWithScoreTemplate.post_process_called.append(prompt)
return MockModelWithScoreTemplate
@pytest.fixture
def mock_model_no_score_template():
"""Mock model class that does not support score template."""
class MockModelNoScoreTemplate:
supports_score_template = False
return MockModelNoScoreTemplate
class TestGetScorePrompt:
    """Tests for the get_score_prompt function."""
    def test_tokenization_kwargs_passed_through(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
    ):
        """Test that tokenization kwargs are properly passed through."""
        data_1 = "Query text"
        data_2 = "Document text"
        # Test with truncation - custom kwargs for this test
        custom_tokenization_kwargs = {
            "add_special_tokens": True,
            "return_tensors": None,
            "truncation": True,
            "max_length": 20,
        }
        full_prompt, engine_prompt = get_score_prompt(
            llm_reranker_model_config,
            cross_encoder_tokenizer,
            custom_tokenization_kwargs,
            data_1,
            data_2,
        )
        assert isinstance(full_prompt, str)
        assert "prompt_token_ids" in engine_prompt
        # With max_length=20 and truncation, should not exceed this
        assert len(engine_prompt["prompt_token_ids"]) <= 20
        # Since truncation was applied, token_ids should be a prefix of full encoding
        full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[
            "input_ids"
        ]
        actual_ids = engine_prompt["prompt_token_ids"]
        assert full_ids[: len(actual_ids)] == actual_ids, (
            f"Token IDs are not a prefix of full encoding.\n"
            f"Full IDs: {full_ids}\n"
            f"Actual IDs: {actual_ids}"
        )
    def test_model_supports_score_template(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test when model supports score template (no score_template arg)."""
        # Patch model-class resolution so get_score_prompt sees the mock.
        with patch(
            "vllm.model_executor.model_loader.get_model_cls",
            return_value=mock_model_with_score_template,
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query text",
                "document text",
            )
        # The prompt must come from the mock's get_score_template.
        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
        assert "prompt_token_ids" in engine_prompt
        assert len(engine_prompt["prompt_token_ids"]) > 0
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
    def test_model_supports_score_template_but_custom_template_provided(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test when model supports score template but custom template is provided."""
        # Jinja template that concatenates both message contents; an explicit
        # score_template must win over the model's built-in template.
        template = (
            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
        )
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_with_score_template,
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
                score_template=template,  # Providing a template
            )
        assert "prompt_token_ids" in engine_prompt
        # The custom template, not the mock's "[QUERY]..." one, was applied.
        assert full_prompt == "TEMPLATE_USED query doc"
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
    def test_not_using_default_template(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test that models without score-template support fall back to the
        HF chat template for prompt construction."""
        # FIXME: Models implementing SupportsScoreTemplate must use their custom
        # template implementation by default to preserve existing functionality.
        # Attempting to use tokenizer_config.json templates would most likely break
        # these models, as often they just inherit the template from the original LLM.
        # CLI --chat-template overrides are still supported.
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_no_score_template,
            ),
            patch(
                "vllm.entrypoints.score_utils.apply_hf_chat_template",
                return_value="test querytest doc",
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "test query",
                "test doc",
            )
        # The (mocked) chat-template output was used verbatim.
        assert full_prompt == "test querytest doc"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
    def test_fallback_with_pad_token(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test fallback path when ChatTemplateResolutionError
        and use_pad_token=True."""
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_no_score_template,
            ),
            # No chat template available -> forces the text_pair fallback.
            patch(
                "vllm.entrypoints.score_utils.apply_hf_chat_template",
                side_effect=ChatTemplateResolutionError("No template"),
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,  # use_pad_token=True
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )
        assert "prompt_token_ids" in engine_prompt
        # Should have token_type_ids from text_pair encoding
        assert "token_type_ids" in engine_prompt
        assert "query" in full_prompt
        assert "document" in full_prompt
        # With a pad token, the two texts must not be naively concatenated.
        assert full_prompt != "querydocument"
        assert (
            engine_prompt["prompt_token_ids"]
            == cross_encoder_tokenizer(
                "query", text_pair="document", add_special_tokens=True
            )["input_ids"]
        )
        # FIXME(?): add_special_tokens=False is needed because in this case
        # full_prompt is obtained by decoding the tokenized prompt, which includes
        # special tokens and we would get duplicated special tokens otherwise.
        # This is inconsistent with other cases.
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer,
            full_prompt,
            engine_prompt,
            add_special_tokens=False,
        )
    def test_fallback_without_pad_token(
        self,
        llm_reranker_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
        """Test fallback path when ChatTemplateResolutionError
        and use_pad_token=False."""
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_no_score_template,
            ),
            # No chat template available -> forces the concatenation fallback.
            patch(
                "vllm.entrypoints.score_utils.apply_hf_chat_template",
                side_effect=ChatTemplateResolutionError("No template"),
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                llm_reranker_model_config,  # use_pad_token=False
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "document",
            )
        # Without a pad token, the two texts are concatenated directly.
        assert full_prompt == "querydocument"
        assert "prompt_token_ids" in engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )
    def test_post_process_tokens_called(
        self,
        cross_encoder_model_config,
        cross_encoder_tokenizer,
        tokenization_kwargs,
        mock_model_with_score_template,
    ):
        """Test that post_process_tokens is called on the engine prompt."""
        # Reset the call tracker
        mock_model_with_score_template.post_process_called.clear()
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
                return_value=mock_model_with_score_template,
            ),
            patch(
                "vllm.entrypoints.score_utils.apply_hf_chat_template",
                side_effect=ChatTemplateResolutionError("No template"),
            ),
        ):
            full_prompt, engine_prompt = get_score_prompt(
                cross_encoder_model_config,
                cross_encoder_tokenizer,
                tokenization_kwargs,
                "query",
                "doc",
            )
        # post_process_tokens should have been called once
        assert len(mock_model_with_score_template.post_process_called) == 1
        # It must have received the very same engine prompt object (identity).
        assert mock_model_with_score_template.post_process_called[0] is engine_prompt
        assert_prompt_tokenization_consistent(
            cross_encoder_tokenizer, full_prompt, engine_prompt
        )

View File

@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
) )
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import ( from vllm.multimodal.utils import (
encode_audio_base64, encode_audio_url,
encode_image_base64, encode_image_url,
encode_video_base64, encode_video_url,
) )
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
@ -141,22 +141,19 @@ def mistral_model_config():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def image_url(): def image_url():
image = ImageAsset("cherry_blossom") image = ImageAsset("cherry_blossom")
base64 = encode_image_base64(image.pil_image) return encode_image_url(image.pil_image)
return f"data:image/jpeg;base64,{base64}"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def video_url(): def video_url():
video = VideoAsset("baby_reading", 1) video = VideoAsset("baby_reading", 1)
base64 = encode_video_base64(video.np_ndarrays) return encode_video_url(video.np_ndarrays)
return f"data:video/jpeg;base64,{base64}"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def audio_url(): def audio_url():
audio = AudioAsset("mary_had_lamb") audio = AudioAsset("mary_had_lamb")
base64 = encode_audio_base64(*audio.audio_and_sample_rate) return encode_audio_url(*audio.audio_and_sample_rate)
return f"data:audio/ogg;base64,{base64}"
def _assert_mm_data_is_image_input( def _assert_mm_data_is_image_input(

View File

@ -0,0 +1,11 @@
model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
accuracy_threshold: 0.85
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--async-scheduling
env:
VLLM_USE_FLASHINFER_MOE_FP8: "1"

View File

@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
Qwen3-Next-FP8-EP2.yaml

View File

@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
print(f"Number of questions: {eval_config['num_questions']}") print(f"Number of questions: {eval_config['num_questions']}")
print(f"Number of few-shot examples: {eval_config['num_fewshot']}") print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
print(f"Server args: {' '.join(server_args)}") print(f"Server args: {' '.join(server_args)}")
print(f"Environment variables: {env_dict}")
# Launch server and run evaluation # Launch server and run evaluation
with RemoteOpenAIServer( with RemoteOpenAIServer(

View File

@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"] RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
kv_cache_factory,
num_mappings: int,
num_layers: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
seed: int,
kv_cache_dtype: str,
device: str,
) -> None:
if kv_cache_dtype == "fp8" and head_size % 16:
pytest.skip()
current_platform.seed_everything(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)
# Generate random block mappings where each source block is mapped to two
# destination blocks.
assert 2 * num_mappings <= num_blocks
src_blocks = random.sample(range(num_blocks), num_mappings)
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
block_mapping: list[tuple[int, int]] = []
for i in range(num_mappings):
src = src_blocks[i]
dst1 = dst_blocks[2 * i]
dst2 = dst_blocks[2 * i + 1]
block_mapping.append((src, dst1))
block_mapping.append((src, dst2))
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(
num_blocks,
block_size,
num_layers,
num_heads,
head_size,
kv_cache_dtype,
dtype,
seed,
device,
)
# Clone the KV caches.
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
# Call the copy blocks kernel.
block_mapping_tensor = torch.tensor(
block_mapping, dtype=torch.int64, device=device
).view(-1, 2)
opcheck(
torch.ops._C_cache_ops.copy_blocks,
(key_caches, value_caches, block_mapping_tensor),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
cond=(head_size == HEAD_SIZES[0]),
)
ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
# Run the reference implementation.
for src, dst in block_mapping:
for cloned_key_cache in cloned_key_caches:
cloned_key_cache[dst].copy_(cloned_key_cache[src])
for cloned_value_cache in cloned_value_caches:
cloned_value_cache[dst].copy_(cloned_value_cache[src])
# Compare the results.
for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
torch.testing.assert_close(key_cache, cloned_key_cache)
for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
torch.testing.assert_close(value_cache, cloned_value_cache)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1) torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks_mla(
kv_lora_rank: int,
qk_rope_head_dim: int,
block_size: int,
num_blocks: int,
num_layers: int,
dtype: torch.dtype,
seed: int,
device: str,
kv_cache_dtype: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
torch.cuda.set_device(device)
entry_size = kv_lora_rank + qk_rope_head_dim
kv_caches = []
for _ in range(num_layers):
kv_cache = _create_mla_cache(
num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
)
_fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
kv_caches.append(kv_cache)
ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
num_mappings = min(2, num_blocks // 2)
src_blocks = random.sample(range(num_blocks), num_mappings)
remaining = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining, 2 * num_mappings)
block_mapping = []
for i in range(num_mappings):
src = src_blocks[i]
dst1 = dst_blocks[2 * i]
dst2 = dst_blocks[2 * i + 1]
block_mapping.append((src, dst1))
block_mapping.append((src, dst2))
block_mapping_tensor = torch.tensor(
block_mapping, dtype=torch.int64, device=device
).view(-1, 2)
for src, dst in block_mapping:
for ref_cache in ref_caches:
ref_cache[dst].copy_(ref_cache[src])
opcheck(
torch.ops._C_cache_ops.copy_blocks_mla,
(kv_caches, block_mapping_tensor),
test_utils=DEFAULT_OPCHECK_TEST_UTILS,
)
ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
for kv_cache, ref_cache in zip(kv_caches, ref_caches):
torch.testing.assert_close(kv_cache, ref_cache)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) @pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) @pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) @pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)

View File

@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from vllm.model_executor.models.mixtral import MixtralMoE from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
from vllm.v1.worker.workspace import init_workspace_manager
NUM_EXPERTS = [8, 64, 192] NUM_EXPERTS = [8, 64, 192]
EP_SIZE = [1, 4] EP_SIZE = [1, 4]
@ -487,6 +488,7 @@ def test_mixtral_moe(
monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_ADDR", "localhost")
monkeypatch.setenv("MASTER_PORT", "12345") monkeypatch.setenv("MASTER_PORT", "12345")
init_distributed_environment() init_distributed_environment()
init_workspace_manager(torch.cuda.current_device())
# Instantiate our and huggingface's MoE blocks # Instantiate our and huggingface's MoE blocks
vllm_config.compilation_config.static_forward_context = dict() vllm_config.compilation_config.static_forward_context = dict()
@ -533,6 +535,11 @@ def test_mixtral_moe(
torch.cuda.synchronize() torch.cuda.synchronize()
torch.cuda.empty_cache() torch.cuda.empty_cache()
# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
# Run forward passes for both MoE blocks # Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs) hf_states, _ = hf_moe.forward(hf_inputs)
vllm_states = vllm_moe.forward(vllm_inputs) vllm_states = vllm_moe.forward(vllm_inputs)

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile import tempfile
from pathlib import Path
import mteb import mteb
import numpy as np import numpy as np
@ -19,6 +20,11 @@ from tests.models.utils import (
get_vllm_extra_kwargs, get_vllm_extra_kwargs,
) )
template_home = (
Path(__file__).parent.parent.parent.parent.parent
/ "examples/pooling/score/template"
)
# Most embedding models on the STS12 task (See #17175): # Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype # - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4 # results in differences less than 1e-4
@ -102,30 +108,6 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
return sim return sim
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
return scores
class OpenAIClientMtebEncoder(VllmMtebEncoder): class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client): def __init__(self, model_name: str, client):
self.model_name = model_name self.model_name = model_name
@ -153,6 +135,35 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
return embeds return embeds
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
self.chat_template: str | None = getattr(vllm_model, "chat_template", None)
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = self.llm.score(
queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False,
chat_template=self.chat_template,
)
scores = np.array(outputs)
return scores
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta mteb_model_meta = _empty_model_meta
@ -387,6 +398,11 @@ def mteb_test_rerank_models(
== model_info.default_pooling_type == model_info.default_pooling_type
) )
chat_template: str | None = None
if model_info.chat_template_name is not None:
chat_template = (template_home / model_info.chat_template_name).read_text()
vllm_model.chat_template = chat_template
vllm_main_score = run_mteb_rerank( vllm_main_score = run_mteb_rerank(
vllm_mteb_encoder(vllm_model), vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS, tasks=MTEB_RERANK_TASKS,

View File

@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import (
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
LASTPoolingEmbedModelInfo(
"nvidia/llama-nemotron-embed-1b-v2",
architecture="LlamaBidirectionalModel",
mteb_score=0.689164662128673,
)
]
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
"nvidia/llama-nemotron-rerank-1b-v2",
architecture="LlamaBidirectionalForSequenceClassification",
chat_template_name="nemotron-rerank.jinja",
mteb_score=0.33994,
),
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)

View File

@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
return return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_mem_efficient_sdp(False)

View File

@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192, max_model_len=8192,
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
), ),
"intern_vl-hf": VLMTestInfo( "intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"], models=["OpenGVLab/InternVL3-1B-hf"],

View File

@ -8,7 +8,7 @@ from PIL.Image import Image
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview" MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
@ -31,10 +31,7 @@ def test_keye_vl(
question: str, question: str,
): ):
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
image_urls = [encode_image_url(image) for image in images]
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
]
engine_args = EngineArgs( engine_args = EngineArgs(
model=MODEL_NAME, model=MODEL_NAME,

View File

@ -15,7 +15,7 @@ from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions.""" """Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR # Use only stop_sign image for Dots.OCR
image = images[0] # Already filtered to stop_sign image = images[0] # Already filtered to stop_sign
image_url = encode_image_url(image)
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}] placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
messages = [ messages = [
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
config["model_name"], trust_remote_code=True config["model_name"], trust_remote_code=True
) )
image_urls = [ image_urls = [encode_image_url(img) for img in images]
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = [{"type": "image", "image": url} for url in image_urls] placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [ messages = [
{ {
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
def build_ovis_prompt(images, config): def build_ovis_prompt(images, config):
"""Build Ovis2.5 specific prompt with custom format.""" """Build Ovis2.5 specific prompt with custom format."""
image_urls = [ image_urls = [encode_image_url(img) for img in images]
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = "\n".join( placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)

View File

@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.message.content == "In the first audio clip, you hear a brief"
assert choice.finish_reason == "length" assert choice.finish_reason == "length"

View File

@ -215,7 +215,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True, trust_remote_code=True,
), ),
"CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"), "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"),
"DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), # FIXME: databricks/dbrx-instruct has been deleted
"DbrxForCausalLM": _HfExamplesInfo(
"databricks/dbrx-instruct", is_available_online=False
),
"DeciLMForCausalLM": _HfExamplesInfo( "DeciLMForCausalLM": _HfExamplesInfo(
"nvidia/Llama-3_3-Nemotron-Super-49B-v1", "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
trust_remote_code=True, trust_remote_code=True,
@ -366,7 +369,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{"tiny": "TitanML/tiny-mixtral"}, {"tiny": "TitanML/tiny-mixtral"},
), ),
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), # FIXME: mosaicml/mpt-7b has been deleted
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b", is_available_online=False),
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
"NemotronHForCausalLM": _HfExamplesInfo( "NemotronHForCausalLM": _HfExamplesInfo(
"nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True "nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True
@ -484,6 +488,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
), ),
"JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"),
"LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "LlamaModel": _HfExamplesInfo("llama", is_available_online=False),
"LlamaBidirectionalModel": _HfExamplesInfo(
"nvidia/llama-nemotron-embed-1b-v2", trust_remote_code=True
),
"MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"),
"ModernBertModel": _HfExamplesInfo( "ModernBertModel": _HfExamplesInfo(
"Alibaba-NLP/gte-modernbert-base", trust_remote_code=True "Alibaba-NLP/gte-modernbert-base", trust_remote_code=True
@ -550,6 +557,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
trust_remote_code=True, trust_remote_code=True,
hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
), ),
"LlamaBidirectionalForSequenceClassification": _HfExamplesInfo(
"nvidia/llama-nemotron-rerank-1b-v2", trust_remote_code=True
),
"ModernBertForSequenceClassification": _HfExamplesInfo( "ModernBertForSequenceClassification": _HfExamplesInfo(
"Alibaba-NLP/gte-reranker-modernbert-base" "Alibaba-NLP/gte-reranker-modernbert-base"
), ),
@ -850,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# disable this temporarily until we support HF format # disable this temporarily until we support HF format
is_available_online=False, is_available_online=False,
), ),
"VoxtralStreamingGeneration": _HfExamplesInfo(
"<place-holder>",
# disable this temporarily until we support HF format
is_available_online=False,
),
# [Encoder-decoder] # [Encoder-decoder]
"WhisperForConditionalGeneration": _HfExamplesInfo( "WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo", "openai/whisper-large-v3-turbo",

View File

@ -38,7 +38,7 @@ def test_inference(
max_num_seqs=32, max_num_seqs=32,
default_torch_num_threads=1, default_torch_num_threads=1,
) as vllm_model: ) as vllm_model:
vllm_output = vllm_model.llm.encode(prompt) vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
assert torch.equal( assert torch.equal(
torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False) torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
) )

View File

@ -399,6 +399,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
@dataclass @dataclass
class RerankModelInfo(ModelInfo): class RerankModelInfo(ModelInfo):
mteb_score: float | None = None mteb_score: float | None = None
chat_template_name: str | None = None
@dataclass @dataclass

View File

@ -38,7 +38,8 @@ TOKENIZERS = [
"EleutherAI/gpt-j-6b", "EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m", "EleutherAI/pythia-70m",
"bigscience/bloom-560m", "bigscience/bloom-560m",
"mosaicml/mpt-7b", # FIXME: mosaicml/mpt-7b has been deleted
# "mosaicml/mpt-7b",
"tiiuae/falcon-7b", "tiiuae/falcon-7b",
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
"codellama/CodeLlama-7b-hf", "codellama/CodeLlama-7b-hf",

View File

@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
"single_tool_add", "single_tool_add",
"single_tool_weather", "single_tool_weather",
"multiple_tool_calls", "multiple_tool_calls",
"complex",
"wrong_json",
], ],
argnames=["model_output", "expected_tool_calls", "expected_content"], argnames=["model_output", "expected_tool_calls", "expected_content"],
argvalues=[ argvalues=[
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
], ],
None, None,
), ),
(
# Complex
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""", # noqa: E501
[
ToolCall(
function=FunctionCall(
name="bash",
arguments=json.dumps(
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
)[:-2],
)
)
],
"hi{hi",
),
(
# Wrong json
"""hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
[
ToolCall(
function=FunctionCall(
name="bash",
arguments=json.dumps(
{"command": "print(\"hello world!\")\nre.compile(r'{}')"}
),
)
)
],
"hi{hi",
),
], ],
) )
def test_extract_tool_calls( def test_extract_tool_calls(
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
), ),
( (
# Complex # Complex
"""[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501
[ [
ToolCall( ToolCall(
function=FunctionCall( function=FunctionCall(
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
) )
) )
], ],
"", "hi{hi",
), ),
], ],
) )

View File

@ -106,6 +106,7 @@ class RemoteOpenAIServer:
env.update(env_dict) env.update(env_dict)
serve_cmd = ["vllm", "serve", model, *vllm_serve_args] serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}") print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
print(f"Environment variables: {env}")
self.proc: subprocess.Popen = subprocess.Popen( self.proc: subprocess.Popen = subprocess.Popen(
serve_cmd, serve_cmd,
env=env, env=env,

View File

@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
assert len(manager.block_pool.cached_block_hash_to_block) == 0 assert len(manager.block_pool.cached_block_hash_to_block) == 0
def test_null_parent_block_hash():
block_size = 1
num_cached_blocks = 2
num_full_blocks = 4
pool = BlockPool(
num_gpu_blocks=8,
enable_caching=True,
hash_block_size=block_size,
enable_kv_cache_events=True,
)
req = make_request(
"req_null_parent",
prompt_token_ids=[10, 11, 12, 13],
block_size=block_size,
hash_fn=sha256,
)
assert len(req.block_hashes) == num_full_blocks
# Physical parent is `null_block` (no hash), while the logical parent hash
# still exists in `request.block_hashes[num_cached_blocks - 1]`.
assert pool.null_block.block_hash is None
new_blocks = pool.get_new_blocks(num_full_blocks - 1)
blocks = [
new_blocks[: num_cached_blocks - 1],
pool.null_block, # physical parent
*new_blocks[num_cached_blocks - 1 :],
]
pool.cache_full_blocks(
request=req,
blocks=blocks,
num_cached_blocks=num_cached_blocks,
num_full_blocks=num_full_blocks,
block_size=block_size,
kv_cache_group_id=0,
)
events = pool.take_events()
assert len(events) == 1
event = events[0]
assert isinstance(event, BlockStored)
expected_parent = kv_cache_utils.maybe_convert_block_hash(
req.block_hashes[num_cached_blocks - 1]
)
assert event.parent_block_hash == expected_parent
assert event.parent_block_hash is not None
expected_new_hashes = [
kv_cache_utils.maybe_convert_block_hash(h)
for h in req.block_hashes[num_cached_blocks:num_full_blocks]
]
assert event.block_hashes == expected_new_hashes
# Ensure we didn't accidentally assign a hash to the null block.
assert pool.null_block.block_hash is None
# Sanity check: newly cached physical blocks should have hashes assigned.
assert blocks[num_cached_blocks].block_hash is not None
assert blocks[num_full_blocks - 1].block_hash is not None
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) @pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
def test_kv_cache_events_with_lora(blocks_to_cache: int): def test_kv_cache_events_with_lora(blocks_to_cache: int):
"""Test BlockStored events contain correct lora_id when using LoRA requests.""" """Test BlockStored events contain correct lora_id when using LoRA requests."""

View File

@ -31,7 +31,7 @@ import openai
import requests import requests
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
MAX_OUTPUT_LEN = 256 MAX_OUTPUT_LEN = 256
@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {"url": encode_image_url(image_1)},
"url": f"data:image;base64,{encode_image_base64(image_1)}"
},
}, },
{"type": "text", "text": "What's in this image?"}, {"type": "text", "text": "What's in this image?"},
], ],
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {"url": encode_image_url(image_2)},
"url": f"data:image;base64,{encode_image_base64(image_2)}"
},
}, },
{ {
"type": "image_url", "type": "image_url",

View File

@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
# Use multi-abort to abort multiple requests at once # Use multi-abort to abort multiple requests at once
abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT] abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
await engine.abort(abort_request_ids) await engine.abort(abort_request_ids, internal=False)
# Wait for all tasks to complete # Wait for all tasks to complete
results = await asyncio.gather(*tasks, return_exceptions=True) results = await asyncio.gather(*tasks, return_exceptions=True)
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Abort the request # Abort the request
await engine.abort(request_id) await engine.abort(request_id, internal=False)
# Wait for generation to complete and return final output # Wait for generation to complete and return final output
final_output = await generated final_output = await generated

View File

@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "I am Gyoubu Masataka Oniwa" PROMPT = "I am Gyoubu Masataka Oniwa"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request() -> EngineCoreRequest: def make_request() -> EngineCoreRequest:
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest( return EngineCoreRequest(
request_id=str(uuid.uuid4()), request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=PROMPT_TOKENS, prompt_token_ids=PROMPT_TOKENS,
mm_features=None, mm_features=None,
sampling_params=SamplingParams(), sampling_params=SamplingParams(),

View File

@ -45,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT = "Hello my name is Robert and I love quantization kernels" PROMPT = "Hello my name is Robert and I love quantization kernels"
PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
_REQUEST_COUNTER = 0
def make_request( def make_request(
params: SamplingParams, prompt_tokens_ids: list[int] | None = None params: SamplingParams, prompt_tokens_ids: list[int] | None = None
@ -52,8 +54,12 @@ def make_request(
if not prompt_tokens_ids: if not prompt_tokens_ids:
prompt_tokens_ids = PROMPT_TOKENS prompt_tokens_ids = PROMPT_TOKENS
global _REQUEST_COUNTER
_REQUEST_COUNTER += 1
request_id = f"request-{_REQUEST_COUNTER}"
return EngineCoreRequest( return EngineCoreRequest(
request_id=str(uuid.uuid4()), request_id=request_id,
external_req_id=f"{request_id}-{uuid.uuid4()}",
prompt_token_ids=prompt_tokens_ids, prompt_token_ids=prompt_tokens_ids,
mm_features=None, mm_features=None,
sampling_params=params, sampling_params=params,

View File

@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
params = SamplingParams(skip_special_tokens=True) params = SamplingParams(skip_special_tokens=True)
request = EngineCoreRequest( request = EngineCoreRequest(
request_id="test", request_id="test",
external_req_id="test-ext",
prompt_token_ids=prompt_token_ids, prompt_token_ids=prompt_token_ids,
mm_features=None, mm_features=None,
sampling_params=params, sampling_params=params,

View File

@ -58,12 +58,12 @@ def test_incremental_detokenization(
output_processor = OutputProcessor( output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
) )
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
# Make N requests. # Make N requests.
requests = [ requests = [
EngineCoreRequest( EngineCoreRequest(
request_id=f"request-{idx}", request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=None, eos_token_id=None,
@ -83,6 +83,11 @@ def test_incremental_detokenization(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
] ]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer. # Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt) output_processor.add_request(request, prompt)
@ -438,15 +443,6 @@ def test_logprobs_processor(
dummy_test_vectors, dummy_test_vectors,
): ):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
)
# Make N requests. # Make N requests.
request_id_list = [ request_id_list = [
@ -454,7 +450,8 @@ def test_logprobs_processor(
] ]
requests = [ requests = [
EngineCoreRequest( EngineCoreRequest(
request_id=request_id_list[idx], request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=None, eos_token_id=None,
@ -476,6 +473,17 @@ def test_logprobs_processor(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
] ]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
if num_sample_logprobs is None
else dummy_test_vectors.generation_logprobs,
prompt_logprobs_raw=None
if num_prompt_logprobs is None
else dummy_test_vectors.prompt_logprobs,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer. # Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt) output_processor.add_request(request, prompt)
@ -621,19 +629,12 @@ def test_stop_token(
] ]
prompt_string = dummy_test_vectors.prompt_strings[0] prompt_string = dummy_test_vectors.prompt_strings[0]
prompt_tokens = dummy_test_vectors.prompt_tokens[0] prompt_tokens = dummy_test_vectors.prompt_tokens[0]
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
)
# Make request. # Make request.
request_id = "request-0" request_id = "request-0"
request = EngineCoreRequest( request = EngineCoreRequest(
request_id=request_id, request_id=request_id,
external_req_id=request_id + "-ext",
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=eos_token_id, eos_token_id=eos_token_id,
@ -655,6 +656,16 @@ def test_stop_token(
pooling_params=None, pooling_params=None,
) )
engine_core = MockEngineCore(
tokens_list=[generation_tokens],
generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
prompt_logprobs_raw=None,
eos_token_id=eos_token_id,
stop_token_ids=stop_token_ids,
ignore_eos=ignore_eos,
request_ids=[request.request_id],
)
# Add request to the detokenizer. # Add request to the detokenizer.
output_processor.add_request(request, prompt_string) output_processor.add_request(request, prompt_string)
@ -720,13 +731,6 @@ def test_stop_string(
dummy_test_vectors, dummy_test_vectors,
): ):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False) output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
)
# Make N requests. # Make N requests.
request_id_list = [ request_id_list = [
@ -734,7 +738,8 @@ def test_stop_string(
] ]
requests = [ requests = [
EngineCoreRequest( EngineCoreRequest(
request_id=request_id_list[idx], request_id=request_id_list[idx] + "-int",
external_req_id=request_id_list[idx],
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=None, eos_token_id=None,
@ -756,6 +761,15 @@ def test_stop_string(
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
] ]
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
if num_sample_logprobs
else None,
prompt_logprobs_raw=None,
request_ids=[req.request_id for req in requests],
)
# Add requests to the detokenizer. # Add requests to the detokenizer.
for request, prompt in zip(requests, dummy_test_vectors.prompt_strings): for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
output_processor.add_request(request, prompt) output_processor.add_request(request, prompt)
@ -813,9 +827,12 @@ def test_stop_string(
for idx, (ref_gen_str, stop_str) in enumerate( for idx, (ref_gen_str, stop_str) in enumerate(
zip(dummy_test_vectors.generation_strings, STOP_STRINGS) zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
): ):
# Request should be aborted. # Request should be aborted (check internal ID in abort list).
internal_request_id = f"request-{idx}-int"
assert internal_request_id in aborted
# Use external ID for collecting outputs
request_id = f"request-{idx}" request_id = f"request-{idx}"
assert request_id in aborted
# Collected values that were generated. # Collected values that were generated.
gen_str = gen_strings[request_id] gen_str = gen_strings[request_id]
@ -848,13 +865,13 @@ def test_stop_string(
def test_iteration_stats(dummy_test_vectors): def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic() engine_core_timestamp = time.monotonic()
# Make N requests. # Make N requests.
requests = [ requests = [
EngineCoreRequest( EngineCoreRequest(
request_id=f"request-{idx}", request_id=f"request-{idx}",
external_req_id=f"request-{idx}-ext",
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=None, eos_token_id=None,
@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
] ]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests except one to the OutputProcessor. # Add all requests except one to the OutputProcessor.
num_active = len(dummy_test_vectors.generation_tokens) - 1 num_active = len(dummy_test_vectors.generation_tokens) - 1
for request in requests[:num_active]: for request in requests[:num_active]:
@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
output_processor = OutputProcessor( output_processor = OutputProcessor(
dummy_test_vectors.tokenizer, log_stats=log_stats dummy_test_vectors.tokenizer, log_stats=log_stats
) )
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic() engine_core_timestamp = time.monotonic()
# Create LoRA requests # Create LoRA requests
@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
lora_assignments = [lora1, lora2, None] lora_assignments = [lora1, lora2, None]
requests = [ requests = [
EngineCoreRequest( EngineCoreRequest(
request_id=f"request-{idx}", request_id=f"request-{idx}-int",
external_req_id=f"request-{idx}",
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=None, eos_token_id=None,
@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
] ]
engine_core = MockEngineCore(
dummy_test_vectors.generation_tokens,
request_ids=[req.request_id for req in requests],
)
# Add all requests to the OutputProcessor # Add all requests to the OutputProcessor
for request in requests: for request in requests:
output_processor.add_request(request, None) output_processor.add_request(request, None)
@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs( outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
) )
# Find and mark request-0 as finished (it uses lora-1) # Find and mark request-0-int as finished (it uses lora-1)
for output in outputs.outputs: for output in outputs.outputs:
if output.request_id == "request-0": if output.request_id == "request-0-int":
output.finish_reason = FinishReason.LENGTH output.finish_reason = FinishReason.LENGTH
break break
@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs( outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
) )
# Find and mark request-1 as finished (it uses lora-2) # Find and mark request-1-int as finished (it uses lora-2)
for output in outputs.outputs: for output in outputs.outputs:
if output.request_id == "request-1": if output.request_id == "request-1-int":
output.finish_reason = FinishReason.LENGTH output.finish_reason = FinishReason.LENGTH
break break
@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs = EngineCoreOutputs( outputs = EngineCoreOutputs(
outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats() outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
) )
# Find and mark request-2 as finished (it has no LoRA) # Find and mark request-2-int as finished (it has no LoRA)
for output in outputs.outputs: for output in outputs.outputs:
if output.request_id == "request-2": if output.request_id == "request-2-int":
output.finish_reason = FinishReason.LENGTH output.finish_reason = FinishReason.LENGTH
break break
@ -1107,7 +1134,9 @@ async def test_request_output_collector():
for idx in range(NUM_REQS) for idx in range(NUM_REQS)
] ]
collector = RequestOutputCollector(RequestOutputKind.DELTA) collector = RequestOutputCollector(
RequestOutputKind.DELTA, request_id="my-request-id-int"
)
# CASE 1: Put then get. # CASE 1: Put then get.
outputs = make_outputs() outputs = make_outputs()
@ -1163,7 +1192,9 @@ async def test_request_output_collector():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_cumulative_output_collector_n(): async def test_cumulative_output_collector_n():
"""Test collector correctly handles multiple outputs by index.""" """Test collector correctly handles multiple outputs by index."""
collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE) collector = RequestOutputCollector(
RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
)
outputs = [ outputs = [
RequestOutput( RequestOutput(
request_id="my-request-id", request_id="my-request-id",
@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n():
@pytest.mark.parametrize("runner", ["generate", "pooling"]) @pytest.mark.parametrize("runner", ["generate", "pooling"])
def test_abort_requests(runner: str, dummy_test_vectors): @pytest.mark.parametrize("abort_by", ["internal", "external"])
def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True) output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
requests = [ requests = [
EngineCoreRequest( EngineCoreRequest(
request_id=f"request-{idx}", request_id=f"request-{idx}",
external_req_id=f"external-{idx}",
prompt_token_ids=prompt_tokens, prompt_token_ids=prompt_tokens,
mm_features=None, mm_features=None,
eos_token_id=None, eos_token_id=None,
@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
output_kind = request.sampling_params.output_kind output_kind = request.sampling_params.output_kind
else: else:
output_kind = request.pooling_params.output_kind output_kind = request.pooling_params.output_kind
queue = RequestOutputCollector(output_kind=output_kind) queue = RequestOutputCollector(
output_kind=output_kind, request_id=request.request_id
)
output_processor.add_request(request, None, queue=queue) output_processor.add_request(request, None, queue=queue)
for request in requests: for request in requests:
output_processor.abort_requests([request.request_id]) if abort_by == "internal":
output_processor.abort_requests([request.request_id], internal=True)
else:
output_processor.abort_requests([request.external_req_id], internal=False)

View File

@ -4,11 +4,12 @@
from vllm import SamplingParams from vllm import SamplingParams
from vllm.outputs import CompletionOutput from vllm.outputs import CompletionOutput
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.parallel_sampling import ParentRequest
def test_parent_request_to_output_stream() -> None: def test_parent_request_to_output_stream() -> None:
parent_request = ParentRequest("parent_id", SamplingParams(n=2)) parent_request = ParentRequest(make_request(SamplingParams(n=2)))
parent_request.child_requests = {"child_id_0", "child_id_1"} parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput( output_0 = CompletionOutput(
index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
) )
# Request not finished # Request not finished
assert ("parent_id", [output_0], False) == parent_request.get_outputs( assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
"child_id_0", output_0 assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
) assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
assert ("parent_id", [output_1], False) == parent_request.get_outputs( assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
"child_id_1", output_1
)
assert ("parent_id", [output_0], False) == parent_request.get_outputs(
"child_id_0", output_0
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
# output_1 finished # output_1 finished
output_1.finish_reason = "ended" output_1.finish_reason = "ended"
assert ("parent_id", [output_0], False) == parent_request.get_outputs( assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
"child_id_0", output_0 assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
)
assert ("parent_id", [output_1], False) == parent_request.get_outputs(
"child_id_1", output_1
)
# Finished output_1 had already returned, DO NOT returned again # Finished output_1 had already returned, DO NOT returned again
assert ("parent_id", [output_0], False) == parent_request.get_outputs( assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
"child_id_0", output_0 assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
# output_0 finished # output_0 finished
output_0.finish_reason = "ended" output_0.finish_reason = "ended"
assert ("parent_id", [output_0], True) == parent_request.get_outputs( assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
"child_id_0", output_0 assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
# Finished output_0 had already returned, DO NOT returned again # Finished output_0 had already returned, DO NOT returned again
assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True) assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True) assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
def test_parent_request_to_output_final_only() -> None: def test_parent_request_to_output_final_only() -> None:
parent_request = ParentRequest( parent_request = ParentRequest(
"parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY) make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
) )
parent_request.child_requests = {"child_id_0", "child_id_1"} parent_request.child_requests = {"child_id_0", "child_id_1"}
output_0 = CompletionOutput( output_0 = CompletionOutput(
@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
) )
# Request not finished, return nothing # Request not finished, return nothing
assert parent_request.get_outputs("child_id_0", output_0) == ( assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
"parent_id", assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
# output_1 finished, but outputs won't be returned until all child requests finished # output_1 finished, but outputs won't be returned until all child requests finished
output_1.finish_reason = "ended" output_1.finish_reason = "ended"
assert parent_request.get_outputs("child_id_0", output_0) == ( assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
"parent_id", assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
[],
False,
)
assert parent_request.get_outputs("child_id_1", output_1) == (
"parent_id",
[],
False,
)
# output_0 finished, as all child requests finished, the output would be returned # output_0 finished, as all child requests finished, the output would be returned
output_0.finish_reason = "ended" output_0.finish_reason = "ended"
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs( assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_0", output_0 "child_id_0", output_0
) )
assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs( assert ([output_0, output_1], True) == parent_request.get_outputs(
"child_id_1", output_1 "child_id_1", output_1
) )
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
return EngineCoreRequest(
request_id="parent_id",
external_req_id="ext_parent_id",
prompt_token_ids=None,
mm_features=None,
sampling_params=sampling_params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
)

View File

@ -5,6 +5,7 @@ import pytest
import torch.cuda import torch.cuda
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore from vllm.v1.engine.core import EngineCore
@ -14,6 +15,11 @@ MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch): def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
"""Test that preprocessing errors are handled gracefully.""" """Test that preprocessing errors are handled gracefully."""
if current_platform.is_rocm():
pytest.skip(
"Skipped on ROCm: this test only works with 'fork', but ROCm uses 'spawn'."
)
assert not torch.cuda.is_initialized(), ( assert not torch.cuda.is_initialized(), (
"fork needs to be used for the engine " "fork needs to be used for the engine "
"core process and this isn't possible if cuda is already initialized" "core process and this isn't possible if cuda is already initialized"

View File

@ -6,6 +6,7 @@ import pytest
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.multimodal import MultiModalUUIDDict
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor from vllm.v1.engine.input_processor import InputProcessor
@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
) )
captured: dict[str, object] = {} captured: dict[str, MultiModalUUIDDict] = {}
def fake_preprocess( def fake_preprocess(
prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
) )
# Expect request-id-based overrides are passed through # Expect request-id-based overrides are passed through
assert captured["mm_uuids"] == { mm_uuids = captured["mm_uuids"]
"image": [f"{request_id}-image-0", f"{request_id}-image-1"], assert set(mm_uuids.keys()) == {"image", "video"}
"video": [f"{request_id}-video-0"], assert len(mm_uuids["image"]) == 2
} assert len(mm_uuids["video"]) == 1
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][0].endswith("-0")
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][1].endswith("-1")
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
"video"
][0].endswith("-0")

View File

@ -343,6 +343,7 @@ class MockEngineCore:
eos_token_id: int | None = None, eos_token_id: int | None = None,
stop_token_ids: list[int] | None = None, stop_token_ids: list[int] | None = None,
ignore_eos: bool = False, ignore_eos: bool = False,
request_ids: list[str] | None = None,
) -> None: ) -> None:
self.num_requests = len(tokens_list) self.num_requests = len(tokens_list)
self.tokens_list = tokens_list self.tokens_list = tokens_list
@ -355,6 +356,11 @@ class MockEngineCore:
self.eos_token_id = eos_token_id self.eos_token_id = eos_token_id
self.stop_token_ids = stop_token_ids self.stop_token_ids = stop_token_ids
self.ignore_eos = ignore_eos self.ignore_eos = ignore_eos
self.request_ids = (
request_ids
if request_ids is not None
else [f"request-{i}" for i in range(self.num_requests)]
)
def get_outputs(self) -> list[EngineCoreOutput]: def get_outputs(self) -> list[EngineCoreOutput]:
do_logprobs = self.do_logprobs do_logprobs = self.do_logprobs
@ -386,7 +392,7 @@ class MockEngineCore:
prompt_logprobs = None prompt_logprobs = None
new_token_id = token_ids[token_idx] new_token_id = token_ids[token_idx]
output = EngineCoreOutput( output = EngineCoreOutput(
request_id=f"request-{req_idx}", request_id=self.request_ids[req_idx],
new_token_ids=[new_token_id], new_token_ids=[new_token_id],
new_logprobs=logprobs, new_logprobs=logprobs,
new_prompt_logprobs_tensors=prompt_logprobs, new_prompt_logprobs_tensors=prompt_logprobs,

View File

@ -8,7 +8,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
# Use a small vision model for testing # Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
@ -52,9 +52,9 @@ async def client(image_server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]: def url_encoded_image(local_asset_server) -> dict[str, str]:
return { return {
image_url: encode_image_base64(local_asset_server.get_image_asset(image_url)) image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
for image_url in TEST_IMAGE_ASSETS for image_url in TEST_IMAGE_ASSETS
} }
@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
model_name: str, model_name: str,
raw_image_url: str, raw_image_url: str,
base64_encoded_image: dict[str, str], url_encoded_image: dict[str, str],
): ):
content_text = "What's in this image?" content_text = "What's in this image?"
messages = [ messages = [
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
"content": [ "content": [
{ {
"type": "input_image", "type": "input_image",
"image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", # noqa: E501 "image_url": url_encoded_image[raw_image_url],
"detail": "auto", "detail": "auto",
}, },
{"type": "input_text", "text": content_text}, {"type": "input_text", "text": content_text},

View File

@ -9,7 +9,7 @@ from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.config import KVTransferConfig from vllm.config import KVTransferConfig
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
from vllm.platforms import current_platform from vllm.platforms import current_platform
MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8" MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
placeholders = [ placeholders = [
{ {
"type": "image_url", "type": "image_url",
"image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"}, "image_url": {"url": encode_image_url(image_pil)},
} }
for image_pil in image_urls for image_pil in image_urls
] ]

View File

@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
has_kv_transfer_group, has_kv_transfer_group,
) )
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.interface import Platform from vllm.platforms.interface import Platform
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
from vllm.v1.request import RequestStatus from vllm.v1.request import RequestStatus
@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
run_test_and_cleanup() run_test_and_cleanup()
class RequestIdMapper:
"""Helper class to map external request IDs to internal request IDs."""
def __init__(self, output_processor: OutputProcessor):
self.req_id_mapping: dict[str, str] = {}
self.original_add_request = output_processor.add_request
output_processor.add_request = self._add_request
def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
self.req_id_mapping[request.external_req_id] = request.request_id
return self.original_add_request(request, *args, **kwargs)
def __call__(self, external_req_id: str) -> str:
return self.req_id_mapping[external_req_id]
def _run_abort_timeout_test(llm: LLM, timeout: int): def _run_abort_timeout_test(llm: LLM, timeout: int):
"""Helper function to run the abort timeout test logic.""" """Helper function to run the abort timeout test logic."""
remote_prefill_opts = { remote_prefill_opts = {
@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
0 0
].req_to_blocks ].req_to_blocks
id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
def req_id(outputs: list[RequestOutput]) -> str:
assert len(outputs) == 1
return id_mapper(outputs[0].request_id)
padding = "Just making this request a little longer so that we're sure " padding = "Just making this request a little longer so that we're sure "
"we're not hitting the small-request lower bound beneath which we don't " "we're not hitting the small-request lower bound beneath which we don't "
"actually trigger the whole kv transfer, but rather just recompute the " "actually trigger the whole kv transfer, but rather just recompute the "
"blocks on D." "blocks on D."
_ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params) req0_id = req_id(
llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
)
# Request finished but not freed # Request finished but not freed
assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
# Some other request, 0 still not freed # Some other request, 0 still not freed
_ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params) req1_id = req_id(
assert "0" in req_to_blocks llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks )
assert req0_id in req_to_blocks
assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
# Wait for timeout and trigger another scheduler loop # Wait for timeout and trigger another scheduler loop
time.sleep(timeout) time.sleep(timeout)
_ = llm.generate([f"What is the capital of France? {padding}"], sampling_params) _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
# Request-0 times out and is cleared! # Request-0 times out and is cleared!
assert "0" not in req_to_blocks assert req0_id not in req_to_blocks
# Need to shutdown the background thread to release NIXL side channel port # Need to shutdown the background thread to release NIXL side channel port
llm.llm_engine.engine_core.shutdown() llm.llm_engine.engine_core.shutdown()

View File

@ -306,10 +306,16 @@ def test_prepare_inputs_padded():
proposer = _create_proposer("eagle", num_speculative_tokens) proposer = _create_proposer("eagle", num_speculative_tokens)
output_metadata, token_indices_to_sample = proposer.prepare_inputs_padded( output_metadata, token_indices_to_sample, num_rejected_tokens_gpu = (
common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count proposer.prepare_inputs_padded(
common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count
)
) )
# Verify num_rejected_tokens_gpu is calculated correctly
expected_num_rejected = torch.tensor([1, 0, 2], dtype=torch.int32, device=device)
assert torch.equal(num_rejected_tokens_gpu, expected_num_rejected)
assert output_metadata.max_query_len == 3 assert output_metadata.max_query_len == 3
assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc) assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc)
assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample) assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample)

View File

@ -4,7 +4,7 @@
import openai import openai
import pytest import pytest
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image(local_asset_server) -> dict[str, str]: def url_encoded_image(local_asset_server) -> dict[str, str]:
return { return {
image_asset: encode_image_base64( image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
local_asset_server.get_image_asset(image_asset)
)
for image_asset in TEST_IMAGE_ASSETS for image_asset in TEST_IMAGE_ASSETS
} }
@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU") @pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
@pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"]) @pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]): async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]):
pytest.skip("Skip this test until it's fixed.") pytest.skip("Skip this test until it's fixed.")
def whats_in_this_image_msg(b64): def whats_in_this_image_msg(url):
return [ return [
{ {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": "What's in this image?"}, {"type": "text", "text": "What's in this image?"},
{ {"type": "image_url", "image_url": {"url": url}},
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{b64}"},
},
], ],
} }
] ]
@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str
# Other requests now should be much faster # Other requests now should be much faster
for image_url in TEST_IMAGE_ASSETS: for image_url in TEST_IMAGE_ASSETS:
image_base64 = base64_encoded_image[image_url] image_url = url_encoded_image[image_url]
chat_completion_from_base64 = await client.chat.completions.create( chat_completion_from_url = await client.chat.completions.create(
model=model_name, model=model_name,
messages=whats_in_this_image_msg(image_base64), messages=whats_in_this_image_msg(image_url),
max_completion_tokens=24, max_completion_tokens=24,
temperature=0.0, temperature=0.0,
) )
result = chat_completion_from_base64 result = chat_completion_from_url
assert result assert result
choice = result.choices[0] choice = result.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"

View File

@ -4,6 +4,7 @@ import functools
from collections.abc import Callable from collections.abc import Callable
import torch import torch
from torch._ops import OpOverload
import vllm.envs as envs import vllm.envs as envs
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -379,6 +380,31 @@ def _rocm_aiter_gemm_a8w8_fake(
return Y return Y
def _rocm_aiter_triton_gemm_a8w8_blockscale_impl(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
def _rocm_aiter_triton_gemm_a8w8_blockscale_fake(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
m = A.shape[0]
n = B.shape[0]
Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
return Y
def _rocm_aiter_gemm_a8w8_blockscale_impl( def _rocm_aiter_gemm_a8w8_blockscale_impl(
A: torch.Tensor, A: torch.Tensor,
B: torch.Tensor, B: torch.Tensor,
@ -433,16 +459,16 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_impl(
from aiter import rmsnorm2d_fwd_with_add from aiter import rmsnorm2d_fwd_with_add
residual_out = torch.empty_like(residual) residual_out = torch.empty_like(residual)
output = torch.empty_like(x) out = torch.empty_like(x)
rmsnorm2d_fwd_with_add( rmsnorm2d_fwd_with_add(
output, # output out, # output
x, # input x, # input
residual, # residual input residual, # residual input
residual_out, # residual output residual_out, # residual output
weight, weight,
variance_epsilon, variance_epsilon,
) )
return output, residual_out return out, residual_out
def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
@ -451,7 +477,84 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake(
weight: torch.Tensor, weight: torch.Tensor,
variance_epsilon: float, variance_epsilon: float,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
return torch.empty_like(x), torch.empty_like(residual) residual_out = torch.empty_like(residual)
out = torch.empty_like(x)
return out, residual_out
def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl(
x: torch.Tensor,
residual: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
import aiter as rocm_aiter
assert quant_dtype in [torch.int8, _FP8_DTYPE]
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
residual_out = torch.empty_like(x)
rocm_aiter.rmsnorm2d_fwd_with_add_dynamicquant(
out,
x,
residual,
residual_out,
y_scale,
weight,
epsilon,
use_model_sensitive_rmsnorm=0,
)
return out, residual_out, y_scale
def _rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake(
x: torch.Tensor,
residual: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
residual_out = torch.empty_like(x)
return out, residual_out, y_scale
def _rocm_aiter_rmsnorm_fused_dynamic_quant_impl(
x: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor]:
import aiter as rocm_aiter
assert quant_dtype in [torch.int8, _FP8_DTYPE]
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
rocm_aiter.rmsnorm2d_fwd_with_dynamicquant(
out, x, y_scale, weight, epsilon, use_model_sensitive_rmsnorm=0
)
return out, y_scale
def _rocm_aiter_rmsnorm_fused_dynamic_quant_fake(
x: torch.Tensor,
weight: torch.Tensor,
epsilon: float,
quant_dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor]:
y_scale = torch.empty(x.shape[0], 1, dtype=torch.float32, device=x.device)
out = torch.empty(x.shape, dtype=quant_dtype, device=x.device)
return out, y_scale
def _rocm_aiter_per_tensor_quant_impl( def _rocm_aiter_per_tensor_quant_impl(
@ -527,7 +630,11 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl(
dtype_quant=AITER_FP8_DTYPE, dtype_quant=AITER_FP8_DTYPE,
res1=residual, res1=residual,
) )
return (x_quant, x_quant_scales, res) return (
x_quant,
res,
x_quant_scales,
)
def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake( def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
@ -541,8 +648,8 @@ def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake(
scale_shape = (M, (N + group_size - 1) // group_size) scale_shape = (M, (N + group_size - 1) // group_size)
return ( return (
torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device), torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device),
torch.empty(scale_shape, dtype=torch.float32, device=x.device),
torch.empty_like(residual, device=residual.device), torch.empty_like(residual, device=residual.device),
torch.empty(scale_shape, dtype=torch.float32, device=x.device),
) )
@ -882,6 +989,12 @@ class rocm_aiter_ops:
dispatch_key=current_platform.dispatch_key, dispatch_key=current_platform.dispatch_key,
) )
direct_register_custom_op(
op_name="rocm_aiter_triton_gemm_a8w8_blockscale",
op_func=_rocm_aiter_triton_gemm_a8w8_blockscale_impl,
fake_impl=_rocm_aiter_triton_gemm_a8w8_blockscale_fake,
)
direct_register_custom_op( direct_register_custom_op(
op_name="rocm_aiter_gemm_a8w8_blockscale", op_name="rocm_aiter_gemm_a8w8_blockscale",
op_func=_rocm_aiter_gemm_a8w8_blockscale_impl, op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
@ -901,6 +1014,20 @@ class rocm_aiter_ops:
dispatch_key=current_platform.dispatch_key, dispatch_key=current_platform.dispatch_key,
) )
direct_register_custom_op(
op_name="rocm_aiter_rmsnorm_fused_dynamic_quant",
op_func=_rocm_aiter_rmsnorm_fused_dynamic_quant_impl,
fake_impl=_rocm_aiter_rmsnorm_fused_dynamic_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
direct_register_custom_op(
op_name="rocm_aiter_rmsnorm_fused_add_dynamic_quant",
op_func=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_impl,
fake_impl=_rocm_aiter_rmsnorm_fused_add_dynamic_quant_fake,
dispatch_key=current_platform.dispatch_key,
)
direct_register_custom_op( direct_register_custom_op(
op_name="rocm_aiter_rmsnorm_fp8_group_quant", op_name="rocm_aiter_rmsnorm_fp8_group_quant",
op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl, op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl,
@ -936,13 +1063,54 @@ class rocm_aiter_ops:
direct_register_custom_op( direct_register_custom_op(
op_name="rocm_aiter_per_token_quant", op_name="rocm_aiter_per_token_quant",
op_func=_rocm_aiter_per_token_quant_impl, op_func=_rocm_aiter_per_token_quant_impl,
mutates_args=["scale"],
fake_impl=_rocm_aiter_per_token_quant_fake, fake_impl=_rocm_aiter_per_token_quant_fake,
dispatch_key=current_platform.dispatch_key, dispatch_key=current_platform.dispatch_key,
) )
_OPS_REGISTERED = True _OPS_REGISTERED = True
@staticmethod
def get_rmsnorm_fused_add_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default
@staticmethod
def get_rmsnorm_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rms_norm.default
@staticmethod
def get_rmsnorm_fused_add_dynamic_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_fused_add_dynamic_quant.default
@staticmethod
def get_rmsnorm_fused_dynamic_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_fused_dynamic_quant.default
@staticmethod
def get_rmsnorm_group_fused_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default
@staticmethod
def get_rmsnorm_group_add_fused_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default
@staticmethod
def get_per_token_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_per_token_quant.default
@staticmethod
def get_group_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_group_fp8_quant.default
@staticmethod
def get_act_mul_fused_fp8_group_quant_op() -> OpOverload:
return torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default
@staticmethod
def rms_norm(
x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
) -> torch.Tensor:
return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
@staticmethod @staticmethod
def rms_norm2d_with_add( def rms_norm2d_with_add(
x: torch.Tensor, x: torch.Tensor,
@ -954,12 +1122,6 @@ class rocm_aiter_ops:
x, residual, weight, variance_epsilon x, residual, weight, variance_epsilon
) )
@staticmethod
def rms_norm(
x: torch.Tensor, weight: torch.Tensor, variance_epsilon: float
) -> torch.Tensor:
return torch.ops.vllm.rocm_aiter_rms_norm(x, weight, variance_epsilon)
@staticmethod @staticmethod
def gemm_a8w8( def gemm_a8w8(
A: torch.Tensor, A: torch.Tensor,
@ -971,6 +1133,19 @@ class rocm_aiter_ops:
) -> torch.Tensor: ) -> torch.Tensor:
return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype) return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
@staticmethod
def triton_gemm_a8w8_blockscale(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
block_size: list[int],
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
return torch.ops.vllm.rocm_aiter_triton_gemm_a8w8_blockscale(
A, B, As, Bs, output_dtype
)
@staticmethod @staticmethod
def gemm_a8w8_blockscale( def gemm_a8w8_blockscale(
A: torch.Tensor, A: torch.Tensor,
@ -1242,19 +1417,6 @@ class rocm_aiter_ops:
config=config, config=config,
) )
@staticmethod
def triton_gemm_a8w8_blockscale(
A: torch.Tensor,
B: torch.Tensor,
As: torch.Tensor,
Bs: torch.Tensor,
block_size: list[int],
output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
@staticmethod @staticmethod
def group_fp8_quant( def group_fp8_quant(
input_2d: torch.Tensor, input_2d: torch.Tensor,

View File

@ -2328,18 +2328,6 @@ def concat_and_cache_mla(
) )
def copy_blocks(
key_caches: list[torch.Tensor],
value_caches: list[torch.Tensor],
block_mapping: torch.Tensor,
) -> None:
torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
def swap_blocks( def swap_blocks(
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
) -> None: ) -> None:

View File

@ -383,18 +383,6 @@ class ipex_ops:
) )
return None return None
@staticmethod
def copy_blocks(
key_caches: list[torch.Tensor],
value_caches: list[torch.Tensor],
block_mapping: torch.Tensor,
) -> None:
torch.xpu.copy_blocks( # type: ignore
key_caches,
value_caches,
block_mapping,
)
@staticmethod @staticmethod
def swap_blocks( def swap_blocks(
src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor

View File

@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp):
cu_seqlens=cu_seqlens, cu_seqlens=cu_seqlens,
) )
if is_reshaped: if is_reshaped:
output = output.view(bsz, q_len, -1) output = output.reshape(bsz, q_len, -1)
return output return output
def _forward_fa( def _forward_fa(
@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp):
fa_version=self._fa_version, fa_version=self._fa_version,
) )
if is_reshaped: if is_reshaped:
output = output.view(bsz, q_len, -1) output = output.reshape(bsz, q_len, -1)
return output return output
def forward_native( def forward_native(

View File

@ -1847,7 +1847,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
random_seed=args.seed, random_seed=args.seed,
dataset_path=args.dataset_path, dataset_path=args.dataset_path,
disable_shuffle=args.disable_shuffle, disable_shuffle=args.disable_shuffle,
prefix_len=args.common_prefix_len,
).sample( ).sample(
tokenizer=tokenizer, tokenizer=tokenizer,
num_requests=args.num_prompts, num_requests=args.num_prompts,

View File

@ -1281,12 +1281,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Repetition penalty sampling parameter. Only has effect on " help="Repetition penalty sampling parameter. Only has effect on "
"openai-compatible backends.", "openai-compatible backends.",
) )
sampling_group.add_argument(
"--common-prefix-len",
type=int,
default=None,
help="Common prefix length shared by all prompts (used by random dataset)",
)
parser.add_argument( parser.add_argument(
"--served-model-name", "--served-model-name",

View File

@ -6,11 +6,13 @@ import torch
from torch._higher_order_ops import auto_functionalized from torch._higher_order_ops import auto_functionalized
from torch._ops import OpOverload from torch._ops import OpOverload
from vllm._aiter_ops import rocm_aiter_ops
from vllm.config import get_current_vllm_config from vllm.config import get_current_vllm_config
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
QuantKey, QuantKey,
_normalize_quant_group_shape, _normalize_quant_group_shape,
kFp8Dynamic64Sym, kFp8Dynamic64Sym,
@ -150,26 +152,50 @@ class MatcherRotaryEmbedding(MatcherCustomOp):
class MatcherRMSNorm(MatcherCustomOp): class MatcherRMSNorm(MatcherCustomOp):
def __init__(self, epsilon: float, enabled: bool | None = None): def __init__(
self,
epsilon: float,
enabled: bool | None = None,
match_rocm_aiter: bool = False,
):
if enabled is None: if enabled is None:
enabled = RMSNorm.enabled() enabled = RMSNorm.enabled()
super().__init__(enabled) super().__init__(enabled)
self.epsilon = epsilon self.epsilon = epsilon
self._rmsnorm_op = RMS_OP
self.match_rocm_aiter = match_rocm_aiter
if match_rocm_aiter:
self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_op()
def inputs(self): def inputs(self):
input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16) input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
weight = self.empty(16) weight = self.empty(16)
return [input, weight] return [input, weight]
def forward_rocm_aiter(
self,
input: torch.Tensor,
weight: torch.Tensor,
) -> torch.Tensor:
return self._rmsnorm_op(
x=input,
weight=weight,
variance_epsilon=self.epsilon,
)
def forward_custom( def forward_custom(
self, self,
input: torch.Tensor, input: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
if self.match_rocm_aiter:
return self.forward_rocm_aiter(input, weight)
result = torch.empty_like(input) result = torch.empty_like(input)
_, result = auto_functionalized( _, result = auto_functionalized(
RMS_OP, self._rmsnorm_op,
result=result, result=result,
input=input, input=input,
weight=weight, weight=weight,
@ -189,12 +215,23 @@ class MatcherRMSNorm(MatcherCustomOp):
class MatcherFusedAddRMSNorm(MatcherCustomOp): class MatcherFusedAddRMSNorm(MatcherCustomOp):
def __init__(self, epsilon: float, enabled: bool | None = None): def __init__(
self,
epsilon: float,
enabled: bool | None = None,
match_rocm_aiter: bool = False,
):
if enabled is None: if enabled is None:
enabled = RMSNorm.enabled() enabled = RMSNorm.enabled()
super().__init__(enabled) super().__init__(enabled)
self.epsilon = epsilon self.epsilon = epsilon
self.match_rocm_aiter = match_rocm_aiter
self._rmsnorm_op = RMS_ADD_OP
if match_rocm_aiter:
self._rmsnorm_op = rocm_aiter_ops.get_rmsnorm_fused_add_op()
def inputs(self): def inputs(self):
input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16) input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
@ -202,14 +239,27 @@ class MatcherFusedAddRMSNorm(MatcherCustomOp):
residual = self.empty(5, 16) residual = self.empty(5, 16)
return [input, weight, residual] return [input, weight, residual]
def forward_rocm_aiter(
self,
input: torch.Tensor,
weight: torch.Tensor,
residual: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
return self._rmsnorm_op(
x=input, residual=residual, weight=weight, variance_epsilon=self.epsilon
)
def forward_custom( def forward_custom(
self, self,
input: torch.Tensor, input: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor, residual: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if self.match_rocm_aiter:
return self.forward_rocm_aiter(input, weight, residual)
_, result, residual = auto_functionalized( _, result, residual = auto_functionalized(
RMS_ADD_OP, self._rmsnorm_op,
input=input, input=input,
residual=residual, residual=residual,
weight=weight, weight=weight,
@ -236,22 +286,46 @@ class MatcherQuantFP8(MatcherCustomOp):
enabled: bool | None = None, enabled: bool | None = None,
has_col_major_scales: bool = False, has_col_major_scales: bool = False,
is_e8m0: bool = False, is_e8m0: bool = False,
match_rocm_aiter: bool = False,
): ):
if enabled is None: if enabled is None:
enabled = QuantFP8.enabled() enabled = QuantFP8.enabled()
super().__init__(enabled) super().__init__(enabled)
self.quant_key = quant_key self.quant_key = quant_key
assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}"
self.QUANT_OP = QUANT_OPS[quant_key]
self.has_col_major_scales = has_col_major_scales self.has_col_major_scales = has_col_major_scales
self.is_e8m0 = is_e8m0 self.is_e8m0 = is_e8m0
self.match_rocm_aiter = match_rocm_aiter
if match_rocm_aiter:
assert not quant_key.scale.group_shape.is_per_tensor(), (
"ROCm aiter fusion pass does not support per tensor quantization"
)
if quant_key.scale.group_shape.is_per_token():
self.QUANT_OP = rocm_aiter_ops.get_per_token_quant_op()
else:
assert quant_key.scale.group_shape.col == 128, (
"ROCm aiter fusion pass currently supports "
"quantization operation with group_size 128"
)
if current_platform.is_fp8_fnuz():
self.QUANT_OP = rocm_aiter_ops.get_group_quant_op()
else:
self.QUANT_OP = (
torch.ops.vllm.triton_per_token_group_quant_fp8.default
)
else:
assert quant_key in QUANT_OPS, (
f"unsupported quantization scheme {quant_key}"
)
self.QUANT_OP = QUANT_OPS[quant_key]
assert quant_key.dtype == current_platform.fp8_dtype(), (
"Only QuantFP8 supported by"
)
assert quant_key.scale2 is None
assert quant_key.dtype == current_platform.fp8_dtype(), (
"Only QuantFP8 supported by"
)
assert quant_key.scale2 is None
self.quant_fp8 = QuantFP8( self.quant_fp8 = QuantFP8(
quant_key.scale.static, quant_key.scale.static,
quant_key.scale.group_shape, quant_key.scale.group_shape,
@ -259,11 +333,29 @@ class MatcherQuantFP8(MatcherCustomOp):
use_ue8m0=is_e8m0, use_ue8m0=is_e8m0,
) )
def forward_rocm_aiter(
self,
input: torch.Tensor,
scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
quant_key_group_shape = self.quant_key.scale.group_shape
if quant_key_group_shape == GroupShape.PER_TOKEN:
return self.QUANT_OP(
x=input,
quant_dtype=self.quant_key.dtype,
scale=scale,
)
else:
return self.QUANT_OP(input, quant_key_group_shape.col)
def forward_custom( def forward_custom(
self, self,
input: torch.Tensor, input: torch.Tensor,
scale: torch.Tensor | None = None, scale: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
if self.match_rocm_aiter:
return self.forward_rocm_aiter(input, scale)
result = torch.empty( result = torch.empty(
input.shape, device=input.device, dtype=self.quant_key.dtype input.shape, device=input.device, dtype=self.quant_key.dtype
) )

View File

@ -16,7 +16,7 @@ from .vllm_inductor_pass import VllmInductorPass
if rocm_aiter_ops.is_enabled(): if rocm_aiter_ops.is_enabled():
from vllm.compilation.rocm_aiter_fusion import ( from vllm.compilation.rocm_aiter_fusion import (
RocmAiterRMSNormFp8GroupQuantFusionPass, RocmAiterRMSNormFusionPass,
RocmAiterSiluMulFp8GroupQuantFusionPass, RocmAiterSiluMulFp8GroupQuantFusionPass,
) )
@ -117,7 +117,9 @@ class PostGradPassManager(CustomGraphPass):
if self.pass_config.fuse_norm_quant: if self.pass_config.fuse_norm_quant:
self.passes += [RMSNormQuantFusionPass(config)] self.passes += [RMSNormQuantFusionPass(config)]
if rocm_aiter_ops.is_enabled(): if rocm_aiter_ops.is_enabled():
self.passes += [RocmAiterRMSNormFp8GroupQuantFusionPass(config)] self.passes += [
RocmAiterRMSNormFusionPass(config),
]
if self.pass_config.fuse_act_quant: if self.pass_config.fuse_act_quant:
self.passes += [ActivationQuantFusionPass(config)] self.passes += [ActivationQuantFusionPass(config)]
if rocm_aiter_ops.is_enabled(): if rocm_aiter_ops.is_enabled():

View File

@ -9,60 +9,195 @@ from torch._inductor.pattern_matcher import PatternMatcherPass
from torch._ops import OpOverload from torch._ops import OpOverload
import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401
from vllm._aiter_ops import rocm_aiter_ops
from vllm.compilation.activation_quant_fusion import ActivationQuantPattern from vllm.compilation.activation_quant_fusion import ActivationQuantPattern
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
QuantKey,
ScaleDesc,
)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .fusion import empty_bf16 from .fusion import (
FusedRMSQuantKey,
)
from .inductor_pass import enable_fake_mode from .inductor_pass import enable_fake_mode
from .matcher_utils import MatcherSiluAndMul from .matcher_utils import (
MatcherFusedAddRMSNorm,
MatcherQuantFP8,
MatcherRMSNorm,
MatcherSiluAndMul,
)
from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
logger = init_logger(__name__) logger = init_logger(__name__)
FP8_DTYPE = current_platform.fp8_dtype() FP8_DTYPE = current_platform.fp8_dtype()
AITER_RMS_GROUP_QUANT_OP = torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default
AITER_RMS_ADD_GROUP_QUANT_OP = (
torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default
)
AITER_RMS_OP = torch.ops.vllm.rocm_aiter_rms_norm.default class AiterRMSNormQuantPattern:
AITER_RMS_ADD_OP = torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default def __init__(
self, epsilon: float, key: FusedRMSQuantKey, match_aiter_quant: bool = True
):
self.epsilon = epsilon
self.quant_dtype = key.quant.dtype
AITER_GROUP_FP8_QUANT_OP = torch.ops.vllm.rocm_aiter_group_fp8_quant.default self.rmsnorm_matcher = (
TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default MatcherRMSNorm(epsilon, match_rocm_aiter=True)
if not key.fused_add
FUSED_SILU_MUL_QUANT_OP = torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default else MatcherFusedAddRMSNorm(epsilon, match_rocm_aiter=True)
)
self.quant_matcher = MatcherQuantFP8(
key.quant,
match_rocm_aiter=match_aiter_quant,
)
class AiterRMSFp8GroupQuantPattern: class AiterRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern):
"""AITER RMSNorm + Dynamic Quantization pattern."""
FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_dynamic_quant_op()
def __init__(
self,
epsilon: float,
quant_dtype: torch.dtype,
match_aiter_quant: bool = True,
group_shape: GroupShape = GroupShape.PER_TOKEN,
symmetric=True,
):
scale = ScaleDesc(torch.float32, False, group_shape)
key = FusedRMSQuantKey(
fused_add=False,
quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
)
super().__init__(epsilon, key, match_aiter_quant)
def register(self, pm_pass):
def pattern(
input: torch.Tensor,
weight: torch.Tensor,
):
result_rms = self.rmsnorm_matcher(input, weight)
result, scale = self.quant_matcher(result_rms)
return result, scale
def replacement(
input: torch.Tensor,
weight: torch.Tensor,
):
result = self.FUSED_OP(
x=input,
weight=weight,
epsilon=self.epsilon,
quant_dtype=self.quant_dtype,
)
return result[0], result[1]
pm.register_replacement(
pattern,
replacement,
self.rmsnorm_matcher.inputs(),
pm.fwd_only,
pm_pass,
)
class AiterFusedAddRMSNormDynamicQuantPattern(AiterRMSNormQuantPattern):
"""AITER RMSNorm Fused Add + Dynamic Quantization pattern."""
FUSED_OP = rocm_aiter_ops.get_rmsnorm_fused_add_dynamic_quant_op()
def __init__(
self,
epsilon: float,
quant_dtype: torch.dtype,
match_aiter_quant: bool = True,
group_shape: GroupShape = GroupShape.PER_TOKEN,
symmetric=True,
):
scale = ScaleDesc(torch.float32, False, group_shape)
key = FusedRMSQuantKey(
fused_add=True,
quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
)
super().__init__(epsilon, key, match_aiter_quant)
def register(self, pm_pass):
def pattern(
input: torch.Tensor,
weight: torch.Tensor,
residual: torch.Tensor,
):
result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
result, scale = self.quant_matcher(result_rms)
return result, residual_out, scale
def replacement(
input: torch.Tensor, weight: torch.Tensor, residual: torch.Tensor
):
result = self.FUSED_OP(
x=input,
residual=residual,
weight=weight,
epsilon=self.epsilon,
quant_dtype=self.quant_dtype,
)
return result[0], result[1], result[2]
pm.register_replacement(
pattern,
replacement,
self.rmsnorm_matcher.inputs(),
pm.fwd_only,
pm_pass,
)
class AiterRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern):
""" """
This pattern fuses aiter rms_norm & group fp8 quant custom This pattern fuses aiter rms_norm & group fp8 quant custom
ops into an aiter rms_norm_group_fp8_quant op. ops into an aiter rms_norm_group_fp8_quant op.
""" """
def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_fused_quant_op()
self.epsilon = epsilon
self.quant_dtype = quant_dtype def __init__(
self.quant_op = quant_op self,
epsilon: float,
quant_dtype: torch.dtype,
group_shape: GroupShape,
match_aiter_quant: bool = True,
symmetric=True,
):
scale = ScaleDesc(torch.float32, False, group_shape)
key = FusedRMSQuantKey(
fused_add=False,
quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
)
super().__init__(epsilon, key, match_aiter_quant)
def register(self, pm_pass: PatternMatcherPass): def register(self, pm_pass: PatternMatcherPass):
def pattern( def pattern(
input: torch.Tensor, input: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
): ):
at1 = AITER_RMS_OP(x=input, weight=weight, variance_epsilon=self.epsilon) result_rms = self.rmsnorm_matcher(input, weight)
result, scale = self.quant_matcher(result_rms)
at2 = self.quant_op(at1, 128) return result, scale
return at2[0], at2[1]
def replacement( def replacement(
input: torch.Tensor, input: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
): ):
at = AITER_RMS_GROUP_QUANT_OP( at = self.FUSED_OP(
x=input, x=input,
weight=weight, weight=weight,
variance_epsilon=self.epsilon, variance_epsilon=self.epsilon,
@ -71,49 +206,52 @@ class AiterRMSFp8GroupQuantPattern:
return at[0], at[1] return at[0], at[1]
inputs = [ pm.register_replacement(
empty_bf16(5, 4), # input pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass
empty_bf16(1, 5), # weight )
]
pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
class AiterFusedAddRMSFp8GroupQuantPattern: class AiterFusedAddRMSFp8GroupQuantPattern(AiterRMSNormQuantPattern):
""" """
This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops
into a aiter rms_norm_with_add_group_fp8_quant op. into a aiter rms_norm_with_add_group_fp8_quant op.
""" """
def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): FUSED_OP = rocm_aiter_ops.get_rmsnorm_group_add_fused_quant_op()
self.epsilon = epsilon
self.quant_dtype = quant_dtype def __init__(
self.quant_op = quant_op self,
epsilon: float,
quant_dtype: torch.dtype,
group_shape: GroupShape,
match_aiter_quant: bool = True,
symmetric=True,
):
scale = ScaleDesc(torch.float32, False, group_shape)
key = FusedRMSQuantKey(
fused_add=True,
quant=QuantKey(dtype=quant_dtype, scale=scale, symmetric=symmetric),
)
super().__init__(epsilon, key, match_aiter_quant)
def register(self, pm_pass: PatternMatcherPass): def register(self, pm_pass: PatternMatcherPass):
def pattern( def pattern(
input: torch.Tensor, input: torch.Tensor,
residual: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor,
): ):
at1 = AITER_RMS_ADD_OP( result_rms, residual_out = self.rmsnorm_matcher(input, weight, residual)
x=input, result, scale = self.quant_matcher(result_rms)
residual=residual,
weight=weight,
variance_epsilon=self.epsilon,
)
at2 = self.quant_op(at1[0], 128) return result, residual_out, scale
# result, scale, residual
return at2[0], at2[1], at1[1]
def replacement( def replacement(
input: torch.Tensor, input: torch.Tensor,
residual: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
residual: torch.Tensor,
): ):
at = AITER_RMS_ADD_GROUP_QUANT_OP( at = self.FUSED_OP(
x=input, x=input,
residual=residual, residual=residual,
weight=weight, weight=weight,
@ -124,18 +262,15 @@ class AiterFusedAddRMSFp8GroupQuantPattern:
# result, scale, residual # result, scale, residual
return at[0], at[1], at[2] return at[0], at[1], at[2]
inputs = [ pm.register_replacement(
empty_bf16(5, 4), # input pattern, replacement, self.rmsnorm_matcher.inputs(), pm.fwd_only, pm_pass
empty_bf16(5, 4), # residual )
empty_bf16(1, 5), # weight
]
pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass)
class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass): class RocmAiterRMSNormFusionPass(VllmPatternMatcherPass):
""" """
This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. This pass fuses aiter rms_norm & vllm/aiter quant custom ops
into a fused rms_norm_quant op.
It also supports fused_add_rms_norm. It also supports fused_add_rms_norm.
""" """
@ -144,20 +279,33 @@ class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass):
super().__init__(config) super().__init__(config)
self.patterns: PatternMatcherPass = PatternMatcherPass( self.patterns: PatternMatcherPass = PatternMatcherPass(
pass_name="rocm_aiter_rms_norm_fp8_group_quant_fusion_pass" pass_name="rocm_aiter_rms_norm_quant_fusion_pass"
) )
# Make sure fused add patterns are before simple rms norm, # Make sure fused add patterns are before simple rms norm,
# as the latter is a subset of the former in torch ops # as the latter is a subset of the former in torch ops
for epsilon in [1e-5, 1e-6]: for epsilon in [1e-5, 1e-6]:
# Fuse rms_norm + dynamic group fp8 quant # Fuse aiter rms_norm + aiter dynamic group fp8 quant
for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: AiterRMSFp8GroupQuantPattern(
AiterRMSFp8GroupQuantPattern(epsilon, FP8_DTYPE, quant_op).register( epsilon, FP8_DTYPE, GroupShape(1, 128)
self.patterns ).register(self.patterns)
)
AiterFusedAddRMSFp8GroupQuantPattern( # Fuse aiter fused_add_rms_norm + aiter dynamic group fp8 quant
epsilon, FP8_DTYPE, quant_op AiterFusedAddRMSFp8GroupQuantPattern(
epsilon, FP8_DTYPE, GroupShape(1, 128)
).register(self.patterns)
for match_aiter_quant in [True, False]:
# Fuse aiter rms_norm + (aiter / vllm built-in)
# dynamic per-token fp8 quant
AiterRMSNormDynamicQuantPattern(
epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant
).register(self.patterns)
# Fuse aiter fused_add_rms_norm + (aiter / vllm built-in)
# dynamic per-token fp8 quant
AiterFusedAddRMSNormDynamicQuantPattern(
epsilon, FP8_DTYPE, match_aiter_quant=match_aiter_quant
).register(self.patterns) ).register(self.patterns)
self.dump_patterns(config, self.patterns) self.dump_patterns(config, self.patterns)
@ -169,6 +317,8 @@ class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass):
def uuid(self) -> Any: def uuid(self) -> Any:
fusion_patterns = [ fusion_patterns = [
AiterRMSNormDynamicQuantPattern,
AiterFusedAddRMSNormDynamicQuantPattern,
AiterRMSFp8GroupQuantPattern, AiterRMSFp8GroupQuantPattern,
AiterFusedAddRMSFp8GroupQuantPattern, AiterFusedAddRMSFp8GroupQuantPattern,
] ]
@ -181,6 +331,8 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern):
ops into an aiter silu_and_mul_group_fp8_quant op. ops into an aiter silu_and_mul_group_fp8_quant op.
""" """
FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op()
def __init__(self, quant_op: OpOverload): def __init__(self, quant_op: OpOverload):
self.silu_and_mul_matcher = MatcherSiluAndMul() self.silu_and_mul_matcher = MatcherSiluAndMul()
self.quant_op = quant_op self.quant_op = quant_op
@ -196,7 +348,7 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern):
def replacement( def replacement(
input: torch.Tensor, input: torch.Tensor,
): ):
at = FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128) at = self.FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128)
return at[0], at[1] return at[0], at[1]
inputs = [ inputs = [
@ -216,6 +368,11 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass):
https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
""" """
AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op()
TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default
QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]
@enable_fake_mode @enable_fake_mode
def __init__(self, config: VllmConfig): def __init__(self, config: VllmConfig):
super().__init__(config) super().__init__(config)
@ -224,7 +381,7 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass):
pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass"
) )
for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: for quant_op in self.QUANT_OPS:
AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns)
self.dump_patterns(config, self.patterns) self.dump_patterns(config, self.patterns)

View File

@ -11,7 +11,6 @@ import torch
from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic import ConfigDict, Field, field_validator, model_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
import vllm.envs as envs import vllm.envs as envs
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.backends.registry import AttentionBackendEnum
@ -29,6 +28,7 @@ from vllm.transformers_utils.config import (
get_pooling_config, get_pooling_config,
get_sentence_transformer_tokenizer_config, get_sentence_transformer_tokenizer_config,
is_encoder_decoder, is_encoder_decoder,
is_rope_parameters_nested,
try_get_dense_modules, try_get_dense_modules,
try_get_generation_config, try_get_generation_config,
try_get_safetensors_metadata, try_get_safetensors_metadata,
@ -1094,11 +1094,10 @@ class ModelConfig:
# The size of inputs_embeds is usually identical to the size # The size of inputs_embeds is usually identical to the size
# of the hidden states, however there are exceptions, such as # of the hidden states, however there are exceptions, such as
# embedding models like CLIP and SigLIP # embedding models like CLIP and SigLIP
for target_attr in ("projection_dim", "projection_size"): names = ("projection_dim", "projection_size")
if hasattr(self.hf_text_config, target_attr): return getattr_iter(
return getattr(self.hf_text_config, target_attr) self.hf_text_config, names, default_factory=self.get_hidden_size
)
return self.get_hidden_size()
@property @property
def is_deepseek_mla(self) -> bool: def is_deepseek_mla(self) -> bool:
@ -1231,14 +1230,12 @@ class ModelConfig:
# For ChatGLM: # For ChatGLM:
"multi_query_group_num", "multi_query_group_num",
] ]
for attr in attributes:
num_kv_heads = getattr(self.hf_text_config, attr, None)
if num_kv_heads is not None:
return num_kv_heads
# For non-grouped-query attention models, the number of KV heads is # For non-grouped-query attention models, the number of KV heads is
# equal to the number of attention heads. # equal to the number of attention heads.
return self.hf_text_config.num_attention_heads default_factory = lambda: self.hf_text_config.num_attention_heads
return getattr_iter(
self.hf_text_config, attributes, default_factory=default_factory
)
def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
"""Returns the number of KV heads per GPU.""" """Returns the number of KV heads per GPU."""
@ -1542,6 +1539,10 @@ class ModelConfig:
def is_multimodal_raw_input_only_model(self) -> bool: def is_multimodal_raw_input_only_model(self) -> bool:
return self._model_info.supports_multimodal_raw_input_only return self._model_info.supports_multimodal_raw_input_only
@property
def requires_raw_input_tokens(self) -> bool:
return self._model_info.requires_raw_input_tokens
@property @property
def is_cross_encoder(self) -> bool: def is_cross_encoder(self) -> bool:
return ( return (
@ -2125,9 +2126,7 @@ def _get_and_verify_max_len(
# In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict].
# To simplify the verification, we convert it to dict[str, TypedDict]. # To simplify the verification, we convert it to dict[str, TypedDict].
rope_parameters = getattr(hf_config, "rope_parameters", None) rope_parameters = getattr(hf_config, "rope_parameters", None)
if rope_parameters and not set(rope_parameters.keys()).issubset( if rope_parameters and not is_rope_parameters_nested(rope_parameters):
ALLOWED_LAYER_TYPES
):
rope_parameters = {"": rope_parameters} rope_parameters = {"": rope_parameters}
# NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE

View File

@ -9,7 +9,7 @@ import inspect
import json import json
import pathlib import pathlib
import textwrap import textwrap
from collections.abc import Iterable, Mapping, Sequence, Set from collections.abc import Callable, Iterable, Mapping, Sequence, Set
from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
from itertools import pairwise from itertools import pairwise
from typing import TYPE_CHECKING, Any, Protocol, TypeVar from typing import TYPE_CHECKING, Any, Protocol, TypeVar
@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field:
def getattr_iter( def getattr_iter(
object: object, names: Iterable[str], default: Any, warn: bool = False object: object,
names: Iterable[str],
default: Any | None = None,
default_factory: Callable[[], Any] | None = None,
warn: bool = False,
) -> Any: ) -> Any:
""" """
A helper function that retrieves an attribute from an object which may A helper function that retrieves an attribute from an object which may
@ -96,7 +100,7 @@ def getattr_iter(
names[0], names[0],
) )
return getattr(object, name) return getattr(object, name)
return default return default_factory() if default_factory is not None else default
def contains_object_print(text: str) -> bool: def contains_object_print(text: str) -> bool:

View File

@ -408,7 +408,13 @@ class MooncakeConnectorWorker:
self.engine = TransferEngine() self.engine = TransferEngine()
self.hostname = get_ip() self.hostname = get_ip()
ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "") protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get( # type: ignore[union-attr]
"mooncake_protocol", "rdma"
)
logger.info(
"The Mooncake Transfer Engine is using %s as its protocol.", protocol
)
ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "")
if ret_value != 0: if ret_value != 0:
raise RuntimeError("Mooncake Transfer Engine initialization failed.") raise RuntimeError("Mooncake Transfer Engine initialization failed.")

View File

@ -67,6 +67,15 @@ else:
logger = init_logger(__name__) logger = init_logger(__name__)
class ChatTemplateResolutionError(ValueError):
"""Raised when chat template resolution fails.
This is a subclass of ValueError for backward compatibility with
existing exception handlers.
"""
MODALITY_PLACEHOLDERS_MAP = { MODALITY_PLACEHOLDERS_MAP = {
"image": "<##IMAGE##>", "image": "<##IMAGE##>",
"audio": "<##AUDIO##>", "audio": "<##AUDIO##>",
@ -1814,7 +1823,7 @@ def apply_hf_chat_template(
) )
if hf_chat_template is None: if hf_chat_template is None:
raise ValueError( raise ChatTemplateResolutionError(
"As of transformers v4.44, default chat template is no longer " "As of transformers v4.44, default chat template is no longer "
"allowed, so you must provide a chat template if the tokenizer " "allowed, so you must provide a chat template if the tokenizer "
"does not define one." "does not define one."

View File

@ -1280,6 +1280,7 @@ class LLM:
pooling_params: PoolingParams | None = None, pooling_params: PoolingParams | None = None,
lora_request: list[LoRARequest] | LoRARequest | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None, tokenization_kwargs: dict[str, Any] | None = None,
score_template: str | None = None,
) -> list[ScoringRequestOutput]: ) -> list[ScoringRequestOutput]:
model_config = self.model_config model_config = self.model_config
@ -1313,6 +1314,7 @@ class LLM:
data_2=d, data_2=d,
tokenizer=tokenizer, tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
score_template=score_template,
) )
if token_type_ids := engine_prompt.pop("token_type_ids", None): if token_type_ids := engine_prompt.pop("token_type_ids", None):
@ -1347,6 +1349,7 @@ class LLM:
use_tqdm: bool | Callable[..., tqdm] = True, use_tqdm: bool | Callable[..., tqdm] = True,
pooling_params: PoolingParams | None = None, pooling_params: PoolingParams | None = None,
lora_request: list[LoRARequest] | LoRARequest | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None,
chat_template: str | None = None,
) -> list[ScoringRequestOutput]: ) -> list[ScoringRequestOutput]:
"""Generate similarity scores for all pairs `<text,text_pair>` or """Generate similarity scores for all pairs `<text,text_pair>` or
`<multi-modal data, multi-modal data pair>`. `<multi-modal data, multi-modal data pair>`.
@ -1379,6 +1382,8 @@ class LLM:
lora_request: LoRA request to use for generation, if any. lora_request: LoRA request to use for generation, if any.
pooling_params: The pooling parameters for pooling. If None, we pooling_params: The pooling parameters for pooling. If None, we
use the default pooling parameters. use the default pooling parameters.
chat_template: The chat template to use for the scoring. If None, we
use the model's default chat template.
Returns: Returns:
A list of `ScoringRequestOutput` objects containing the A list of `ScoringRequestOutput` objects containing the
generated scores in the same order as the input prompts. generated scores in the same order as the input prompts.
@ -1406,6 +1411,11 @@ class LLM:
): ):
raise ValueError("Score API is only enabled for num_labels == 1.") raise ValueError("Score API is only enabled for num_labels == 1.")
if not model_config.is_cross_encoder and chat_template is not None:
raise ValueError(
"chat_template is only supported for cross-encoder models."
)
# the tokenizer for models such as # the tokenizer for models such as
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
# lists of tokens to the `text` and `text_pair` kwargs # lists of tokens to the `text` and `text_pair` kwargs
@ -1475,6 +1485,7 @@ class LLM:
use_tqdm, use_tqdm,
pooling_params, pooling_params,
lora_request, lora_request,
score_template=chat_template,
) )
else: else:
return self._embedding_score( return self._embedding_score(
@ -1610,7 +1621,7 @@ class LLM:
added_request_ids.append(request_id) added_request_ids.append(request_id)
except Exception as e: except Exception as e:
if added_request_ids: if added_request_ids:
self.llm_engine.abort_request(added_request_ids) self.llm_engine.abort_request(added_request_ids, internal=True)
raise e raise e
def _validate_mm_data_and_uuids( def _validate_mm_data_and_uuids(
@ -1720,7 +1731,7 @@ class LLM:
priority=priority, priority=priority,
prompt_text=prompt_text, prompt_text=prompt_text,
) )
return request_id return engine_request.request_id
def _run_engine( def _run_engine(
self, *, use_tqdm: bool | Callable[..., tqdm] = True self, *, use_tqdm: bool | Callable[..., tqdm] = True

View File

@ -909,6 +909,16 @@ def build_app(args: Namespace) -> FastAPI:
@app.exception_handler(RequestValidationError) @app.exception_handler(RequestValidationError)
async def validation_exception_handler(_: Request, exc: RequestValidationError): async def validation_exception_handler(_: Request, exc: RequestValidationError):
from vllm.entrypoints.openai.protocol import VLLMValidationError
param = None
for error in exc.errors():
if "ctx" in error and "error" in error["ctx"]:
ctx_error = error["ctx"]["error"]
if isinstance(ctx_error, VLLMValidationError):
param = ctx_error.parameter
break
exc_str = str(exc) exc_str = str(exc)
errors_str = str(exc.errors()) errors_str = str(exc.errors())
@ -922,6 +932,7 @@ def build_app(args: Namespace) -> FastAPI:
message=message, message=message,
type=HTTPStatus.BAD_REQUEST.phrase, type=HTTPStatus.BAD_REQUEST.phrase,
code=HTTPStatus.BAD_REQUEST, code=HTTPStatus.BAD_REQUEST,
param=param,
) )
) )
return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
@ -1145,6 +1156,7 @@ async def init_app_state(
engine_client, engine_client,
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
score_template=resolved_chat_template,
log_error_stack=args.log_error_stack, log_error_stack=args.log_error_stack,
) )
if ("embed" in supported_tasks or "score" in supported_tasks) if ("embed" in supported_tasks or "score" in supported_tasks)

View File

@ -131,6 +131,36 @@ class ErrorResponse(OpenAIBaseModel):
error: ErrorInfo error: ErrorInfo
class VLLMValidationError(ValueError):
"""vLLM-specific validation error for request validation failures.
Args:
message: The error message describing the validation failure.
parameter: Optional parameter name that failed validation.
value: Optional value that was rejected during validation.
"""
def __init__(
self,
message: str,
*,
parameter: str | None = None,
value: Any = None,
) -> None:
super().__init__(message)
self.parameter = parameter
self.value = value
def __str__(self):
base = super().__str__()
extras = []
if self.parameter is not None:
extras.append(f"parameter={self.parameter}")
if self.value is not None:
extras.append(f"value={self.value}")
return f"{base} ({', '.join(extras)})" if extras else base
class ModelPermission(OpenAIBaseModel): class ModelPermission(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
object: str = "model_permission" object: str = "model_permission"
@ -466,7 +496,9 @@ class ResponsesRequest(OpenAIBaseModel):
@model_validator(mode="before") @model_validator(mode="before")
def validate_prompt(cls, data): def validate_prompt(cls, data):
if data.get("prompt") is not None: if data.get("prompt") is not None:
raise ValueError("prompt template is not supported") raise VLLMValidationError(
"prompt template is not supported", parameter="prompt"
)
return data return data
@model_validator(mode="before") @model_validator(mode="before")
@ -850,7 +882,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
@classmethod @classmethod
def validate_stream_options(cls, data): def validate_stream_options(cls, data):
if data.get("stream_options") and not data.get("stream"): if data.get("stream_options") and not data.get("stream"):
raise ValueError("Stream options can only be defined when `stream=True`.") raise VLLMValidationError(
"Stream options can only be defined when `stream=True`.",
parameter="stream_options",
)
return data return data
@ -859,19 +894,29 @@ class ChatCompletionRequest(OpenAIBaseModel):
def check_logprobs(cls, data): def check_logprobs(cls, data):
if (prompt_logprobs := data.get("prompt_logprobs")) is not None: if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
raise ValueError( raise VLLMValidationError(
"`prompt_logprobs` are not available when `stream=True`." "`prompt_logprobs` are not available when `stream=True`.",
parameter="prompt_logprobs",
) )
if prompt_logprobs < 0 and prompt_logprobs != -1: if prompt_logprobs < 0 and prompt_logprobs != -1:
raise ValueError("`prompt_logprobs` must be a positive value or -1.") raise VLLMValidationError(
"`prompt_logprobs` must be a positive value or -1.",
parameter="prompt_logprobs",
value=prompt_logprobs,
)
if (top_logprobs := data.get("top_logprobs")) is not None: if (top_logprobs := data.get("top_logprobs")) is not None:
if top_logprobs < 0 and top_logprobs != -1: if top_logprobs < 0 and top_logprobs != -1:
raise ValueError("`top_logprobs` must be a positive value or -1.") raise VLLMValidationError(
"`top_logprobs` must be a positive value or -1.",
parameter="top_logprobs",
value=top_logprobs,
)
if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"): if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
raise ValueError( raise VLLMValidationError(
"when using `top_logprobs`, `logprobs` must be set to true." "when using `top_logprobs`, `logprobs` must be set to true.",
parameter="top_logprobs",
) )
return data return data
@ -1285,9 +1330,10 @@ class CompletionRequest(OpenAIBaseModel):
for k in ("json", "regex", "choice") for k in ("json", "regex", "choice")
) )
if count > 1: if count > 1:
raise ValueError( raise VLLMValidationError(
"You can only use one kind of constraints for structured " "You can only use one kind of constraints for structured "
"outputs ('json', 'regex' or 'choice')." "outputs ('json', 'regex' or 'choice').",
parameter="structured_outputs",
) )
return data return data
@ -1296,14 +1342,23 @@ class CompletionRequest(OpenAIBaseModel):
def check_logprobs(cls, data): def check_logprobs(cls, data):
if (prompt_logprobs := data.get("prompt_logprobs")) is not None: if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1): if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
raise ValueError( raise VLLMValidationError(
"`prompt_logprobs` are not available when `stream=True`." "`prompt_logprobs` are not available when `stream=True`.",
parameter="prompt_logprobs",
) )
if prompt_logprobs < 0 and prompt_logprobs != -1: if prompt_logprobs < 0 and prompt_logprobs != -1:
raise ValueError("`prompt_logprobs` must be a positive value or -1.") raise VLLMValidationError(
"`prompt_logprobs` must be a positive value or -1.",
parameter="prompt_logprobs",
value=prompt_logprobs,
)
if (logprobs := data.get("logprobs")) is not None and logprobs < 0: if (logprobs := data.get("logprobs")) is not None and logprobs < 0:
raise ValueError("`logprobs` must be a positive value.") raise VLLMValidationError(
"`logprobs` must be a positive value.",
parameter="logprobs",
value=logprobs,
)
return data return data
@ -1311,7 +1366,10 @@ class CompletionRequest(OpenAIBaseModel):
@classmethod @classmethod
def validate_stream_options(cls, data): def validate_stream_options(cls, data):
if data.get("stream_options") and not data.get("stream"): if data.get("stream_options") and not data.get("stream"):
raise ValueError("Stream options can only be defined when `stream=True`.") raise VLLMValidationError(
"Stream options can only be defined when `stream=True`.",
parameter="stream_options",
)
return data return data
@ -2138,7 +2196,15 @@ class TranscriptionRequest(OpenAIBaseModel):
stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
stream = data.get("stream", False) stream = data.get("stream", False)
if any(bool(data.get(so, False)) for so in stream_opts) and not stream: if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
raise ValueError("Stream options can only be defined when `stream=True`.") # Find which specific stream option was set
invalid_param = next(
(so for so in stream_opts if data.get(so, False)),
"stream_include_usage",
)
raise VLLMValidationError(
"Stream options can only be defined when `stream=True`.",
parameter=invalid_param,
)
return data return data
@ -2351,7 +2417,15 @@ class TranslationRequest(OpenAIBaseModel):
stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
stream = data.get("stream", False) stream = data.get("stream", False)
if any(bool(data.get(so, False)) for so in stream_opts) and not stream: if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
raise ValueError("Stream options can only be defined when `stream=True`.") # Find which specific stream option was set
invalid_param = next(
(so for so in stream_opts if data.get(so, False)),
"stream_include_usage",
)
raise VLLMValidationError(
"Stream options can only be defined when `stream=True`.",
parameter=invalid_param,
)
return data return data

View File

@ -495,6 +495,7 @@ async def run_batch(
engine_client, engine_client,
openai_serving_models, openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
score_template=None,
) )
if ("embed" in supported_tasks or enable_serving_reranking) if ("embed" in supported_tasks or enable_serving_reranking)
else None else None

View File

@ -417,8 +417,7 @@ class OpenAIServingChat(OpenAIServing):
generators.append(generator) generators.append(generator)
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
assert len(generators) == 1 assert len(generators) == 1
(result_generator,) = generators (result_generator,) = generators
@ -448,8 +447,7 @@ class OpenAIServingChat(OpenAIServing):
except GenerationError as e: except GenerationError as e:
return self._convert_generation_error_to_response(e) return self._convert_generation_error_to_response(e)
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
def get_chat_request_role(self, request: ChatCompletionRequest) -> str: def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
if request.add_generation_prompt: if request.add_generation_prompt:
@ -682,7 +680,7 @@ class OpenAIServingChat(OpenAIServing):
tool_parsers = [None] * num_choices tool_parsers = [None] * num_choices
except Exception as e: except Exception as e:
logger.exception("Error in tool parser creation.") logger.exception("Error in tool parser creation.")
data = self.create_streaming_error_response(str(e)) data = self.create_streaming_error_response(e)
yield f"data: {data}\n\n" yield f"data: {data}\n\n"
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"
return return
@ -1328,9 +1326,8 @@ class OpenAIServingChat(OpenAIServing):
except GenerationError as e: except GenerationError as e:
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
except Exception as e: except Exception as e:
# TODO: Use a vllm-specific Validation Error
logger.exception("Error in chat completion stream generator.") logger.exception("Error in chat completion stream generator.")
data = self.create_streaming_error_response(str(e)) data = self.create_streaming_error_response(e)
yield f"data: {data}\n\n" yield f"data: {data}\n\n"
# Send the final done message after all response.n are finished # Send the final done message after all response.n are finished
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"
@ -1354,8 +1351,7 @@ class OpenAIServingChat(OpenAIServing):
except asyncio.CancelledError: except asyncio.CancelledError:
return self.create_error_response("Client disconnected") return self.create_error_response("Client disconnected")
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
assert final_res is not None assert final_res is not None

View File

@ -23,6 +23,7 @@ from vllm.entrypoints.openai.protocol import (
PromptTokenUsageInfo, PromptTokenUsageInfo,
RequestResponseMetadata, RequestResponseMetadata,
UsageInfo, UsageInfo,
VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.serving_engine import (
GenerationError, GenerationError,
@ -247,8 +248,7 @@ class OpenAIServingCompletion(OpenAIServing):
generators.append(generator) generators.append(generator)
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
result_generator = merge_async_iterators(*generators) result_generator = merge_async_iterators(*generators)
@ -308,8 +308,7 @@ class OpenAIServingCompletion(OpenAIServing):
except GenerationError as e: except GenerationError as e:
return self._convert_generation_error_to_response(e) return self._convert_generation_error_to_response(e)
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
# When user requests streaming but we don't stream, we still need to # When user requests streaming but we don't stream, we still need to
# return a streaming response with a single event. # return a streaming response with a single event.
@ -510,9 +509,8 @@ class OpenAIServingCompletion(OpenAIServing):
except GenerationError as e: except GenerationError as e:
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
except Exception as e: except Exception as e:
# TODO: Use a vllm-specific Validation Error
logger.exception("Error in completion stream generator.") logger.exception("Error in completion stream generator.")
data = self.create_streaming_error_response(str(e)) data = self.create_streaming_error_response(e)
yield f"data: {data}\n\n" yield f"data: {data}\n\n"
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"
@ -660,8 +658,11 @@ class OpenAIServingCompletion(OpenAIServing):
token = f"token_id:{token_id}" token = f"token_id:{token_id}"
else: else:
if tokenizer is None: if tokenizer is None:
raise ValueError( raise VLLMValidationError(
"Unable to get tokenizer because `skip_tokenizer_init=True`" "Unable to get tokenizer because "
"`skip_tokenizer_init=True`",
parameter="skip_tokenizer_init",
value=True,
) )
token = tokenizer.decode(token_id) token = tokenizer.decode(token_id)
@ -720,6 +721,15 @@ class OpenAIServingCompletion(OpenAIServing):
request: CompletionRequest, request: CompletionRequest,
max_input_length: int | None = None, max_input_length: int | None = None,
) -> RenderConfig: ) -> RenderConfig:
# Validate max_tokens before using it
if request.max_tokens is not None and request.max_tokens > self.max_model_len:
raise VLLMValidationError(
f"'max_tokens' ({request.max_tokens}) cannot be greater than "
f"the model's maximum context length ({self.max_model_len}).",
parameter="max_tokens",
value=request.max_tokens,
)
max_input_tokens_len = self.max_model_len - (request.max_tokens or 0) max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
return RenderConfig( return RenderConfig(
max_length=max_input_tokens_len, max_length=max_input_tokens_len,

View File

@ -57,6 +57,7 @@ from vllm.entrypoints.openai.protocol import (
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponse, TranscriptionResponse,
TranslationRequest, TranslationRequest,
VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import (
@ -322,8 +323,10 @@ class OpenAIServing:
input_processor = self.input_processor input_processor = self.input_processor
tokenizer = input_processor.tokenizer tokenizer = input_processor.tokenizer
if tokenizer is None: if tokenizer is None:
raise ValueError( raise VLLMValidationError(
"You cannot use beam search when `skip_tokenizer_init=True`" "You cannot use beam search when `skip_tokenizer_init=True`",
parameter="skip_tokenizer_init",
value=True,
) )
eos_token_id: int = tokenizer.eos_token_id # type: ignore eos_token_id: int = tokenizer.eos_token_id # type: ignore
@ -706,8 +709,7 @@ class OpenAIServing:
return None return None
except Exception as e: except Exception as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
async def _collect_batch( async def _collect_batch(
self, self,
@ -738,14 +740,43 @@ class OpenAIServing:
return None return None
except Exception as e: except Exception as e:
return self.create_error_response(str(e)) return self.create_error_response(e)
def create_error_response( def create_error_response(
self, self,
message: str, message: str | Exception,
err_type: str = "BadRequestError", err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
param: str | None = None,
) -> ErrorResponse: ) -> ErrorResponse:
exc: Exception | None = None
if isinstance(message, Exception):
exc = message
from vllm.entrypoints.openai.protocol import VLLMValidationError
if isinstance(exc, VLLMValidationError):
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter
elif isinstance(exc, (ValueError, TypeError, RuntimeError)):
# Common validation errors from user input
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = None
elif exc.__class__.__name__ == "TemplateError":
# jinja2.TemplateError (avoid importing jinja2)
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = None
else:
err_type = "InternalServerError"
status_code = HTTPStatus.INTERNAL_SERVER_ERROR
param = None
message = str(exc)
if self.log_error_stack: if self.log_error_stack:
exc_type, _, _ = sys.exc_info() exc_type, _, _ = sys.exc_info()
if exc_type is not None: if exc_type is not None:
@ -753,18 +784,27 @@ class OpenAIServing:
else: else:
traceback.print_stack() traceback.print_stack()
return ErrorResponse( return ErrorResponse(
error=ErrorInfo(message=message, type=err_type, code=status_code.value) error=ErrorInfo(
message=message,
type=err_type,
code=status_code.value,
param=param,
)
) )
def create_streaming_error_response( def create_streaming_error_response(
self, self,
message: str, message: str | Exception,
err_type: str = "BadRequestError", err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
param: str | None = None,
) -> str: ) -> str:
json_str = json.dumps( json_str = json.dumps(
self.create_error_response( self.create_error_response(
message=message, err_type=err_type, status_code=status_code message=message,
err_type=err_type,
status_code=status_code,
param=param,
).model_dump() ).model_dump()
) )
return json_str return json_str
@ -825,6 +865,7 @@ class OpenAIServing:
message=f"The model `{request.model}` does not exist.", message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError", err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND, status_code=HTTPStatus.NOT_FOUND,
param="model",
) )
def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None: def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
@ -991,11 +1032,13 @@ class OpenAIServing:
ClassificationChatRequest: "classification", ClassificationChatRequest: "classification",
} }
operation = operations.get(type(request), "embedding generation") operation = operations.get(type(request), "embedding generation")
raise ValueError( raise VLLMValidationError(
f"This model's maximum context length is " f"This model's maximum context length is "
f"{self.max_model_len} tokens. However, you requested " f"{self.max_model_len} tokens. However, you requested "
f"{token_num} tokens in the input for {operation}. " f"{token_num} tokens in the input for {operation}. "
f"Please reduce the length of the input." f"Please reduce the length of the input.",
parameter="input_tokens",
value=token_num,
) )
return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids) return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
@ -1017,20 +1060,24 @@ class OpenAIServing:
# Note: input length can be up to model context length - 1 for # Note: input length can be up to model context length - 1 for
# completion-like requests. # completion-like requests.
if token_num >= self.max_model_len: if token_num >= self.max_model_len:
raise ValueError( raise VLLMValidationError(
f"This model's maximum context length is " f"This model's maximum context length is "
f"{self.max_model_len} tokens. However, your request has " f"{self.max_model_len} tokens. However, your request has "
f"{token_num} input tokens. Please reduce the length of " f"{token_num} input tokens. Please reduce the length of "
"the input messages." "the input messages.",
parameter="input_tokens",
value=token_num,
) )
if max_tokens is not None and token_num + max_tokens > self.max_model_len: if max_tokens is not None and token_num + max_tokens > self.max_model_len:
raise ValueError( raise VLLMValidationError(
"'max_tokens' or 'max_completion_tokens' is too large: " "'max_tokens' or 'max_completion_tokens' is too large: "
f"{max_tokens}. This model's maximum context length is " f"{max_tokens}. This model's maximum context length is "
f"{self.max_model_len} tokens and your request has " f"{self.max_model_len} tokens and your request has "
f"{token_num} input tokens ({max_tokens} > {self.max_model_len}" f"{token_num} input tokens ({max_tokens} > {self.max_model_len}"
f" - {token_num})." f" - {token_num}).",
parameter="max_tokens",
value=max_tokens,
) )
return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids) return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)

View File

@ -94,6 +94,7 @@ from vllm.entrypoints.openai.protocol import (
ResponsesResponse, ResponsesResponse,
ResponseUsage, ResponseUsage,
StreamingResponsesResponse, StreamingResponsesResponse,
VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.serving_engine import (
GenerationError, GenerationError,
@ -271,6 +272,7 @@ class OpenAIServingResponses(OpenAIServing):
err_type="invalid_request_error", err_type="invalid_request_error",
message=error_message, message=error_message,
status_code=HTTPStatus.BAD_REQUEST, status_code=HTTPStatus.BAD_REQUEST,
param="input",
) )
return None return None
@ -282,6 +284,7 @@ class OpenAIServingResponses(OpenAIServing):
err_type="invalid_request_error", err_type="invalid_request_error",
message="logprobs are not supported with gpt-oss models", message="logprobs are not supported with gpt-oss models",
status_code=HTTPStatus.BAD_REQUEST, status_code=HTTPStatus.BAD_REQUEST,
param="logprobs",
) )
if request.store and not self.enable_store and request.background: if request.store and not self.enable_store and request.background:
return self.create_error_response( return self.create_error_response(
@ -294,6 +297,7 @@ class OpenAIServingResponses(OpenAIServing):
"the vLLM server." "the vLLM server."
), ),
status_code=HTTPStatus.BAD_REQUEST, status_code=HTTPStatus.BAD_REQUEST,
param="background",
) )
if request.previous_input_messages and request.previous_response_id: if request.previous_input_messages and request.previous_response_id:
return self.create_error_response( return self.create_error_response(
@ -301,6 +305,7 @@ class OpenAIServingResponses(OpenAIServing):
message="Only one of `previous_input_messages` and " message="Only one of `previous_input_messages` and "
"`previous_response_id` can be set.", "`previous_response_id` can be set.",
status_code=HTTPStatus.BAD_REQUEST, status_code=HTTPStatus.BAD_REQUEST,
param="previous_response_id",
) )
return None return None
@ -457,8 +462,7 @@ class OpenAIServingResponses(OpenAIServing):
) )
generators.append(generator) generators.append(generator)
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
assert len(generators) == 1 assert len(generators) == 1
(result_generator,) = generators (result_generator,) = generators
@ -546,7 +550,7 @@ class OpenAIServingResponses(OpenAIServing):
except GenerationError as e: except GenerationError as e:
return self._convert_generation_error_to_response(e) return self._convert_generation_error_to_response(e)
except Exception as e: except Exception as e:
return self.create_error_response(str(e)) return self.create_error_response(e)
async def _make_request( async def _make_request(
self, self,
@ -630,8 +634,7 @@ class OpenAIServingResponses(OpenAIServing):
except asyncio.CancelledError: except asyncio.CancelledError:
return self.create_error_response("Client disconnected") return self.create_error_response("Client disconnected")
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
# NOTE: Implementation of stauts is still WIP, but for now # NOTE: Implementation of stauts is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate. # we guarantee that if the status is not "completed", it is accurate.
@ -1074,7 +1077,7 @@ class OpenAIServingResponses(OpenAIServing):
response = self._convert_generation_error_to_response(e) response = self._convert_generation_error_to_response(e)
except Exception as e: except Exception as e:
logger.exception("Background request failed for %s", request.request_id) logger.exception("Background request failed for %s", request.request_id)
response = self.create_error_response(str(e)) response = self.create_error_response(e)
finally: finally:
new_event_signal.set() new_event_signal.set()
@ -1099,7 +1102,7 @@ class OpenAIServingResponses(OpenAIServing):
response = self._convert_generation_error_to_response(e) response = self._convert_generation_error_to_response(e)
except Exception as e: except Exception as e:
logger.exception("Background request failed for %s", request.request_id) logger.exception("Background request failed for %s", request.request_id)
response = self.create_error_response(str(e)) response = self.create_error_response(e)
if isinstance(response, ErrorResponse): if isinstance(response, ErrorResponse):
# If the request has failed, update the status to "failed". # If the request has failed, update the status to "failed".
@ -1116,7 +1119,11 @@ class OpenAIServingResponses(OpenAIServing):
starting_after: int | None = None, starting_after: int | None = None,
) -> AsyncGenerator[StreamingResponsesResponse, None]: ) -> AsyncGenerator[StreamingResponsesResponse, None]:
if response_id not in self.event_store: if response_id not in self.event_store:
raise ValueError(f"Unknown response_id: {response_id}") raise VLLMValidationError(
f"Unknown response_id: {response_id}",
parameter="response_id",
value=response_id,
)
event_deque, new_event_signal = self.event_store[response_id] event_deque, new_event_signal = self.event_store[response_id]
start_index = 0 if starting_after is None else starting_after + 1 start_index = 0 if starting_after is None else starting_after + 1
@ -1172,6 +1179,7 @@ class OpenAIServingResponses(OpenAIServing):
return self.create_error_response( return self.create_error_response(
err_type="invalid_request_error", err_type="invalid_request_error",
message="Cannot cancel a synchronous response.", message="Cannot cancel a synchronous response.",
param="response_id",
) )
# Update the status to "cancelled". # Update the status to "cancelled".
@ -1191,6 +1199,7 @@ class OpenAIServingResponses(OpenAIServing):
err_type="invalid_request_error", err_type="invalid_request_error",
message=f"Response with id '{response_id}' not found.", message=f"Response with id '{response_id}' not found.",
status_code=HTTPStatus.NOT_FOUND, status_code=HTTPStatus.NOT_FOUND,
param="response_id",
) )
def _make_store_not_supported_error(self) -> ErrorResponse: def _make_store_not_supported_error(self) -> ErrorResponse:
@ -1203,6 +1212,7 @@ class OpenAIServingResponses(OpenAIServing):
"starting the vLLM server." "starting the vLLM server."
), ),
status_code=HTTPStatus.BAD_REQUEST, status_code=HTTPStatus.BAD_REQUEST,
param="store",
) )
async def _process_simple_streaming_events( async def _process_simple_streaming_events(

View File

@ -30,6 +30,7 @@ from vllm.entrypoints.openai.protocol import (
TranslationSegment, TranslationSegment,
TranslationStreamResponse, TranslationStreamResponse,
UsageInfo, UsageInfo,
VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
@ -259,7 +260,11 @@ class OpenAISpeechToText(OpenAIServing):
) )
if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
raise ValueError("Maximum file size exceeded.") raise VLLMValidationError(
"Maximum file size exceeded",
parameter="audio_filesize_mb",
value=len(audio_data) / 1024**2,
)
with io.BytesIO(audio_data) as bytes_: with io.BytesIO(audio_data) as bytes_:
# NOTE resample to model SR here for efficiency. This is also a # NOTE resample to model SR here for efficiency. This is also a
@ -287,12 +292,18 @@ class OpenAISpeechToText(OpenAIServing):
) )
if request.response_format == "verbose_json": if request.response_format == "verbose_json":
if not isinstance(prompt, dict): if not isinstance(prompt, dict):
raise ValueError(f"Expected prompt to be a dict,got {type(prompt)}") raise VLLMValidationError(
"Expected prompt to be a dict",
parameter="prompt",
value=type(prompt).__name__,
)
prompt_dict = cast(dict, prompt) prompt_dict = cast(dict, prompt)
decoder_prompt = prompt.get("decoder_prompt") decoder_prompt = prompt.get("decoder_prompt")
if not isinstance(decoder_prompt, str): if not isinstance(decoder_prompt, str):
raise ValueError( raise VLLMValidationError(
f"Expected decoder_prompt to bestr, got {type(decoder_prompt)}" "Expected decoder_prompt to be str",
parameter="decoder_prompt",
value=type(decoder_prompt).__name__,
) )
prompt_dict["decoder_prompt"] = decoder_prompt.replace( prompt_dict["decoder_prompt"] = decoder_prompt.replace(
"<|notimestamps|>", "<|0.00|>" "<|notimestamps|>", "<|0.00|>"
@ -412,7 +423,7 @@ class OpenAISpeechToText(OpenAIServing):
except ValueError as e: except ValueError as e:
logger.exception("Error in preprocessing prompt inputs") logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(str(e)) return self.create_error_response(e)
list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
try: try:
@ -448,8 +459,7 @@ class OpenAISpeechToText(OpenAIServing):
for i, prompt in enumerate(prompts) for i, prompt in enumerate(prompts)
] ]
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
if request.stream: if request.stream:
return stream_generator_method( return stream_generator_method(
@ -523,8 +533,7 @@ class OpenAISpeechToText(OpenAIServing):
except asyncio.CancelledError: except asyncio.CancelledError:
return self.create_error_response("Client disconnected") return self.create_error_response("Client disconnected")
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error return self.create_error_response(e)
return self.create_error_response(str(e))
async def _speech_to_text_stream_generator( async def _speech_to_text_stream_generator(
self, self,
@ -634,9 +643,8 @@ class OpenAISpeechToText(OpenAIServing):
) )
except Exception as e: except Exception as e:
# TODO: Use a vllm-specific Validation Error
logger.exception("Error in %s stream generator.", self.task_type) logger.exception("Error in %s stream generator.", self.task_type)
data = self.create_streaming_error_response(str(e)) data = self.create_streaming_error_response(e)
yield f"data: {data}\n\n" yield f"data: {data}\n\n"
# Send the final done message after all response.n are finished # Send the final done message after all response.n are finished
yield "data: [DONE]\n\n" yield "data: [DONE]\n\n"

View File

@ -52,6 +52,7 @@ class ServingScores(OpenAIServing):
models: OpenAIServingModels, models: OpenAIServingModels,
*, *,
request_logger: RequestLogger | None, request_logger: RequestLogger | None,
score_template: str | None = None,
log_error_stack: bool = False, log_error_stack: bool = False,
) -> None: ) -> None:
super().__init__( super().__init__(
@ -60,6 +61,7 @@ class ServingScores(OpenAIServing):
request_logger=request_logger, request_logger=request_logger,
log_error_stack=log_error_stack, log_error_stack=log_error_stack,
) )
self.score_template = score_template
async def _embedding_score( async def _embedding_score(
self, self,
@ -169,6 +171,7 @@ class ServingScores(OpenAIServing):
data_2=data_2, data_2=data_2,
tokenizer=tokenizer, tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
score_template=self.score_template,
) )
self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt) self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
if request.mm_processor_kwargs is not None: if request.mm_processor_kwargs is not None:

View File

@ -12,6 +12,7 @@ import torch
from pydantic import Field from pydantic import Field
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.entrypoints.openai.protocol import VLLMValidationError
from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt
from vllm.inputs.parse import get_prompt_components, parse_raw_prompts from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
@ -162,8 +163,9 @@ class BaseRenderer(ABC):
) -> list[EmbedsPrompt]: ) -> list[EmbedsPrompt]:
"""Load and validate base64-encoded embeddings into prompt objects.""" """Load and validate base64-encoded embeddings into prompt objects."""
if not self.model_config.enable_prompt_embeds: if not self.model_config.enable_prompt_embeds:
raise ValueError( raise VLLMValidationError(
"You must set `--enable-prompt-embeds` to input `prompt_embeds`." "You must set `--enable-prompt-embeds` to input `prompt_embeds`.",
parameter="prompt_embeds",
) )
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
@ -396,10 +398,12 @@ class CompletionRenderer(BaseRenderer):
) -> TokensPrompt: ) -> TokensPrompt:
"""Create validated TokensPrompt.""" """Create validated TokensPrompt."""
if max_length is not None and len(token_ids) > max_length: if max_length is not None and len(token_ids) > max_length:
raise ValueError( raise VLLMValidationError(
f"This model's maximum context length is {max_length} tokens. " f"This model's maximum context length is {max_length} tokens. "
f"However, your request has {len(token_ids)} input tokens. " f"However, your request has {len(token_ids)} input tokens. "
"Please reduce the length of the input messages." "Please reduce the length of the input messages.",
parameter="input_tokens",
value=len(token_ids),
) )
tokens_prompt = TokensPrompt(prompt_token_ids=token_ids) tokens_prompt = TokensPrompt(prompt_token_ids=token_ids)

View File

@ -11,9 +11,11 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageEmbedsParam, ChatCompletionContentPartImageEmbedsParam,
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
ChatTemplateResolutionError,
MultiModalItemTracker, MultiModalItemTracker,
_ContentPart, _ContentPart,
_parse_chat_message_content_part, _parse_chat_message_content_part,
apply_hf_chat_template,
) )
from vllm.inputs import TokensPrompt from vllm.inputs import TokensPrompt
from vllm.model_executor.models.interfaces import supports_score_template from vllm.model_executor.models.interfaces import supports_score_template
@ -139,10 +141,8 @@ def _parse_score_content(
return next(iter(mm_placeholder_storage.values()))[0] return next(iter(mm_placeholder_storage.values()))[0]
def apply_score_template( def _apply_model_score_template(
model_config: ModelConfig, model_config: ModelConfig, prompt_1: str, prompt_2: str
prompt_1: str,
prompt_2: str,
) -> str: ) -> str:
# NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf) # NOTE(Simon): lazy import to avoid bring in all dependencies (e.g. gguf)
from vllm.model_executor.model_loader import get_model_cls from vllm.model_executor.model_loader import get_model_cls
@ -181,6 +181,7 @@ def get_score_prompt(
tokenization_kwargs: dict[str, Any], tokenization_kwargs: dict[str, Any],
data_1: str | ScoreContentPartParam, data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam,
score_template: str | None = None,
) -> tuple[str, TokensPrompt]: ) -> tuple[str, TokensPrompt]:
prompt_1, prompt_2, mm_data = parse_score_data( prompt_1, prompt_2, mm_data = parse_score_data(
data_1, data_1,
@ -190,19 +191,48 @@ def get_score_prompt(
from vllm.model_executor.model_loader import get_model_cls from vllm.model_executor.model_loader import get_model_cls
model = get_model_cls(model_config) model = get_model_cls(model_config)
if supports_score_template(model):
full_prompt = apply_score_template(model_config, prompt_1, prompt_2) def default_tokenizer_encode():
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) if supports_score_template(model):
elif model_config.use_pad_token: full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2)
# cross_encoder models defaults to using pad_token. prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
prompt_inputs = tokenizer( else:
text=prompt_1, text_pair=prompt_2, **tokenization_kwargs if model_config.use_pad_token:
) # cross_encoder models defaults to using pad_token.
full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) prompt_inputs = tokenizer(
text=prompt_1, text_pair=prompt_2, **tokenization_kwargs
)
full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
else:
# `llm as reranker` models defaults to not using pad_token.
full_prompt = prompt_1 + prompt_2
prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)
return full_prompt, prompt_inputs
# FIXME: For now, we only apply a template when one is explicitly provided.
# We cannot rely on the tokenizer's chat template because many models
# inherit junk templates from their base LLM, which breaks both the models
# and the tests that use them.
if score_template is None:
full_prompt, prompt_inputs = default_tokenizer_encode()
else: else:
# `llm as reranker` models defaults to not using pad_token. # FIXME: Try applying a score template from the CLI arg or tokenizer_config.json
full_prompt = prompt_1 + prompt_2 # If that fails because there is no such template,
prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) # fall back to the default implementation.
try:
full_prompt = apply_hf_chat_template(
tokenizer,
[
{"role": "query", "content": prompt_1},
{"role": "document", "content": prompt_2},
],
score_template,
tools=None,
model_config=model_config,
)
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
except ChatTemplateResolutionError:
full_prompt, prompt_inputs = default_tokenizer_encode()
engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"]) engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

View File

@ -186,6 +186,7 @@ class DPMetadata:
class ForwardContext: class ForwardContext:
# copy from vllm_config.compilation_config.static_forward_context # copy from vllm_config.compilation_config.static_forward_context
no_compile_layers: dict[str, Any] no_compile_layers: dict[str, Any]
attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
""" """
Type Dict[str, AttentionMetadata] for v1, map from layer_name of each Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
attention layer to its attention metadata attention layer to its attention metadata
@ -193,7 +194,6 @@ class ForwardContext:
for each microbatch. for each microbatch.
Set dynamically for each forward pass Set dynamically for each forward pass
""" """
attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
# TODO: remove after making all virtual_engines share the same kv cache # TODO: remove after making all virtual_engines share the same kv cache
virtual_engine: int # set dynamically for each forward pass virtual_engine: int # set dynamically for each forward pass
# set dynamically for each forward pass # set dynamically for each forward pass

View File

@ -12,7 +12,6 @@ from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import ( from vllm.lora.utils import (
get_lora_id, get_lora_id,
is_base_embeddding_weights, is_base_embeddding_weights,
is_regex_target_modules,
parse_fine_tuned_lora_name, parse_fine_tuned_lora_name,
) )
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
@ -201,37 +200,13 @@ class LoRAModel:
for module in f.keys(): # noqa for module in f.keys(): # noqa
tensors[module] = f.get_tensor(module) tensors[module] = f.get_tensor(module)
elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path): elif os.path.isfile(lora_bin_file_path) or os.path.isfile(lora_pt_file_path):
# When a bin/pt file is provided, we rely on config to find
# unexpected modules.
unexpected_modules = []
target_modules = peft_helper.target_modules
if not isinstance(target_modules, list):
target_modules = [target_modules]
for module in target_modules:
# Compatible with more modules,
# such as:layers.11.self_attn.k_proj
part_name = module.split(".")[-1]
if part_name not in expected_lora_modules:
unexpected_modules.append(module)
# loaded lora's target modules must be a subset of
# expected_lora_modules. It is not reliable. See
# https://github.com/vllm-project/vllm/pull/5909. But there's no
# other better mechanism.
if unexpected_modules and not is_regex_target_modules(
peft_helper.target_modules, expected_lora_modules
):
raise ValueError(
f"While loading {lora_dir}, expected"
f" target modules in {expected_lora_modules}"
f" but received {unexpected_modules}."
f" Please verify that the loaded LoRA module is correct"
)
lora_file_path = ( lora_file_path = (
lora_bin_file_path lora_bin_file_path
if os.path.isfile(lora_bin_file_path) if os.path.isfile(lora_bin_file_path)
else lora_pt_file_path else lora_pt_file_path
) )
tensors = torch.load(lora_file_path, map_location=device, weights_only=True) tensors = torch.load(lora_file_path, map_location=device, weights_only=True)
check_unexpected_modules(tensors)
else: else:
raise ValueError(f"{lora_dir} doesn't contain tensors") raise ValueError(f"{lora_dir} doesn't contain tensors")

View File

@ -11,9 +11,11 @@ import torch
from vllm import envs from vllm import envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.platforms import current_platform from vllm.platforms import current_platform
logger = init_logger(__name__) logger = init_logger(__name__)
is_batch_invariant = vllm_is_batch_invariant()
_LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} _LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
_LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {} _LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
@ -150,7 +152,8 @@ def _get_lora_b_ptr(
@functools.lru_cache @functools.lru_cache
def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None: def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER
if user_defined_config_folder is not None: # Avoid optimizing for the batch invariant case. Use default config
if user_defined_config_folder is not None and not is_batch_invariant:
gpu_name = torch.cuda.get_device_name() gpu_name = torch.cuda.get_device_name()
gpu_name = gpu_name.replace(" ", "_") gpu_name = gpu_name.replace(" ", "_")
gpu_name = gpu_name.replace("-", "_") gpu_name = gpu_name.replace("-", "_")
@ -203,11 +206,14 @@ def get_lora_op_configs(
# default config # default config
default = {} default = {}
if op_type == "shrink": if op_type == "shrink":
split_k = 64 if batch < 128 else 8
if is_batch_invariant:
split_k = 1
default = { default = {
"block_m": 32, "block_m": 32,
"block_n": 16, "block_n": 16,
"block_k": 256 if batch < 128 else 32, "block_k": 256 if batch < 128 else 32,
"split_k": 64 if batch < 128 else 8, "split_k": split_k,
"num_warps": 4, "num_warps": 4,
"num_ctas": 1, "num_ctas": 1,
"group_size_m": 8, "group_size_m": 8,

View File

@ -5,7 +5,6 @@ import os
from typing import TYPE_CHECKING, Optional from typing import TYPE_CHECKING, Optional
import huggingface_hub import huggingface_hub
import regex as re
from huggingface_hub.utils import ( from huggingface_hub.utils import (
EntryNotFoundError, EntryNotFoundError,
HfHubHTTPError, HfHubHTTPError,
@ -186,39 +185,6 @@ def is_base_embeddding_weights(name: str) -> bool:
return name.endswith(embedding_suffixes) return name.endswith(embedding_suffixes)
def is_regex_target_modules(
load_modules: str | list[str], expected_lora_modules: set[str]
) -> bool:
"""
PEFT supports passing `target_modules` in the form of regular expressions,
such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to
determine whether the suffix in the regular expression is present in the
`expected_lora_modules`.
"""
def is_valid_regex(pattern):
try:
re.compile(pattern)
return True
except re.error:
return False
def is_subset(sub_list, full_set):
return set(sub_list).issubset(full_set)
# Similar to PEFT's processing logic, regex-related operations are only
# executed when the load_modules is a `str`.
if not isinstance(load_modules, str):
return False
if is_valid_regex(load_modules):
match = re.search(r"\((.*?)\)\$?$", load_modules)
if match:
suffix = match.group(1).split("|")
return is_subset(suffix, expected_lora_modules)
return False
def get_supported_lora_modules(model: nn.Module) -> list[str]: def get_supported_lora_modules(model: nn.Module) -> list[str]:
""" """
In vLLM, all linear layers support LoRA. In vLLM, all linear layers support LoRA.

View File

@ -2132,6 +2132,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
torch.float16, torch.float16,
torch.bfloat16, torch.bfloat16,
torch.float8_e4m3fn, torch.float8_e4m3fn,
torch.float8_e4m3fnuz,
] ]
E, num_tokens, N, K, top_k_num = self.moe_problem_size( E, num_tokens, N, K, top_k_num = self.moe_problem_size(
@ -2156,7 +2157,10 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
compute_type = tl.float16 compute_type = tl.float16
elif hidden_states.dtype == torch.float32: elif hidden_states.dtype == torch.float32:
compute_type = tl.float32 compute_type = tl.float32
elif hidden_states.dtype == torch.float8_e4m3fn: elif (
hidden_states.dtype == torch.float8_e4m3fn
or hidden_states.dtype == torch.float8_e4m3fnuz
):
compute_type = tl.bfloat16 compute_type = tl.bfloat16
else: else:
raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")

View File

@ -13,6 +13,10 @@ from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
def __init__(self, defer_input_quant: bool = False) -> None:
super().__init__()
self.defer_input_quant = defer_input_quant
@property @property
def activation_format(self) -> mk.FusedMoEActivationFormat: def activation_format(self) -> mk.FusedMoEActivationFormat:
return mk.FusedMoEActivationFormat.Standard return mk.FusedMoEActivationFormat.Standard
@ -48,6 +52,11 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
# Note: do not use inplace for shared experts overlap # Note: do not use inplace for shared experts overlap
a1 = a1 * topk_weights.to(a1.dtype) a1 = a1 * topk_weights.to(a1.dtype)
# Defer input quant to moe kernel for backends (e.g. AITER, FI)
# which use a single kernel call for quant + experts.
if self.defer_input_quant:
return a1, None, None, None, None
a1q, a1q_scale = moe_kernel_quantize_input( a1q, a1q_scale = moe_kernel_quantize_input(
a1, a1,
quant_config.a1_scale, quant_config.a1_scale,

View File

@ -5,11 +5,15 @@ from functools import lru_cache
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._aiter_ops import rocm_aiter_ops from vllm._aiter_ops import rocm_aiter_ops
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG, FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEQuantConfig, FusedMoEQuantConfig,
) )
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceNoOP,
)
class QuantMethod(IntEnum): class QuantMethod(IntEnum):
@ -263,3 +267,78 @@ def rocm_aiter_fused_experts(
a2_scale=quant_config.a2_scale, a2_scale=quant_config.a2_scale,
doweight_stage1=apply_router_weight_on_input, doweight_stage1=apply_router_weight_on_input,
) )
class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute):
def __init__(self, quant_config):
super().__init__(quant_config)
@property
def activation_formats(
self,
) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
return (
mk.FusedMoEActivationFormat.Standard,
mk.FusedMoEActivationFormat.Standard,
)
def supports_expert_map(self):
return True
def supports_chunking(self):
return False
def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
return TopKWeightAndReduceNoOP()
def workspace_shapes(
self,
M: int,
N: int,
K: int,
topk: int,
global_num_experts: int,
local_num_experts: int,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
# Workspaces are managed internally by AITER.
workspace1 = (0,)
workspace2 = (0,)
output = (M, K)
return (workspace1, workspace2, output)
def apply(
self,
output: torch.Tensor,
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
activation: str,
global_num_experts: int,
expert_map: torch.Tensor | None,
a1q_scale: torch.Tensor | None,
a2_scale: torch.Tensor | None,
workspace13: torch.Tensor,
workspace2: torch.Tensor,
expert_tokens_meta: mk.ExpertTokensMetadata | None,
apply_router_weight_on_input: bool,
):
assert a1q_scale is None
assert a2_scale is None
assert expert_tokens_meta is None
result = rocm_aiter_fused_experts(
hidden_states=hidden_states,
w1=w1,
w2=w2,
topk_weights=topk_weights,
topk_ids=topk_ids,
activation=activation,
apply_router_weight_on_input=apply_router_weight_on_input,
expert_map=expert_map,
quant_config=self.quant_config,
)
assert result.shape == output.shape
output.copy_(result)

View File

@ -6,6 +6,7 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
import vllm.envs as envs import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._aiter_ops import rocm_aiter_ops from vllm._aiter_ops import rocm_aiter_ops
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
@ -23,6 +24,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEPermuteExpertsUnpermute, FusedMoEPermuteExpertsUnpermute,
FusedMoEPrepareAndFinalize, FusedMoEPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum from vllm.platforms.interface import CpuArchEnum
@ -30,9 +34,9 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
from .fused_batched_moe import BatchedTritonExperts from .fused_batched_moe import BatchedTritonExperts
from .fused_moe import TritonExperts, fused_experts from .fused_moe import TritonExperts
else: else:
fused_experts = None # type: ignore TritonExperts = None # type: ignore
if current_platform.is_tpu(): if current_platform.is_tpu():
from .moe_pallas import fused_moe as fused_moe_pallas from .moe_pallas import fused_moe as fused_moe_pallas
@ -265,6 +269,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
else: else:
layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
elif current_platform.is_cuda_alike():
self.moe_quant_config = self.get_fused_moe_quant_config(layer)
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
TritonExperts(self.moe_quant_config),
shared_experts=None,
)
def apply( def apply(
self, self,
@ -278,9 +289,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
router_logits=router_logits, router_logits=router_logits,
) )
def get_fused_moe_quant_config( def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
self, layer: torch.nn.Module
) -> FusedMoEQuantConfig | None:
if self.moe.has_bias: if self.moe.has_bias:
return biased_moe_quant_config( return biased_moe_quant_config(
layer.w13_bias, layer.w13_bias,
@ -322,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
apply_router_weight_on_input=layer.apply_router_weight_on_input, apply_router_weight_on_input=layer.apply_router_weight_on_input,
) )
else: else:
result = fused_experts( result = self.kernel(
hidden_states=x, hidden_states=x,
w1=layer.w13_weight, w1=layer.w13_weight,
w2=layer.w2_weight, w2=layer.w2_weight,
@ -330,7 +339,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
topk_ids=topk_ids, topk_ids=topk_ids,
inplace=True, inplace=True,
activation=layer.activation, activation=layer.activation,
quant_config=self.moe_quant_config,
apply_router_weight_on_input=layer.apply_router_weight_on_input, apply_router_weight_on_input=layer.apply_router_weight_on_input,
global_num_experts=layer.global_num_experts, global_num_experts=layer.global_num_experts,
expert_map=layer.expert_map, expert_map=layer.expert_map,

View File

@ -118,6 +118,7 @@ class ShortConv(MambaBase, CustomOp):
conv_state = self_kv_cache[0].transpose(-1, -2) conv_state = self_kv_cache[0].transpose(-1, -2)
state_indices_tensor = attn_metadata.state_indices_tensor state_indices_tensor = attn_metadata.state_indices_tensor
has_initial_states_p = attn_metadata.has_initial_states_p has_initial_states_p = attn_metadata.has_initial_states_p
query_start_loc_p = attn_metadata.query_start_loc_p
BCx, _ = self.in_proj(hidden_states) BCx, _ = self.in_proj(hidden_states)
@ -165,11 +166,6 @@ class ShortConv(MambaBase, CustomOp):
[num_decodes, num_prefills], [num_decodes, num_prefills],
dim=0, dim=0,
) )
query_start_loc_p = (
attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
if has_prefill
else None
)
conv_output_list = [] conv_output_list = []

View File

@ -117,6 +117,7 @@ class Fp8MoeBackend(Enum):
DEEPGEMM = 3 DEEPGEMM = 3
MARLIN = 4 MARLIN = 4
TRITON = 5 TRITON = 5
AITER = 6
def get_fp8_moe_backend( def get_fp8_moe_backend(
@ -189,6 +190,10 @@ def get_fp8_moe_backend(
logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local") logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
return Fp8MoeBackend.DEEPGEMM return Fp8MoeBackend.DEEPGEMM
if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MOE:
logger.info_once("Using ROCm AITER backend for FP8 MoE", scope="local")
return Fp8MoeBackend.AITER
# default to Triton # default to Triton
logger.info_once("Using Triton backend for FP8 MoE") logger.info_once("Using Triton backend for FP8 MoE")
return Fp8MoeBackend.TRITON return Fp8MoeBackend.TRITON
@ -888,16 +893,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
layer.w13_input_scale = None layer.w13_input_scale = None
layer.w2_input_scale = None layer.w2_input_scale = None
self.rocm_aiter_moe_enabled = False
def process_weights_after_loading(self, layer: Module) -> None: def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False): if getattr(layer, "_already_called_process_weights_after_loading", False):
return return
# Lazy import to avoid importing triton too early.
self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
# TODO (rob): refactor block quant into separate class. # TODO (rob): refactor block quant into separate class.
if self.block_quant: if self.block_quant:
assert self.quant_config.activation_scheme == "dynamic" assert self.quant_config.activation_scheme == "dynamic"
@ -932,7 +931,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv) replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv)
replace_parameter(layer, "w2_weight", w2_weight) replace_parameter(layer, "w2_weight", w2_weight)
replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv) replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv)
if self.rocm_aiter_moe_enabled: if self.fp8_backend == Fp8MoeBackend.AITER:
# reshaping weights is required for aiter moe kernel. # reshaping weights is required for aiter moe kernel.
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
layer.w13_weight.data, layer.w2_weight.data layer.w13_weight.data, layer.w2_weight.data
@ -1026,7 +1025,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
) )
start += shard_size start += shard_size
if self.rocm_aiter_moe_enabled: if self.fp8_backend == Fp8MoeBackend.AITER:
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
layer.w13_weight, layer.w2_weight layer.w13_weight, layer.w2_weight
) )
@ -1072,6 +1071,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
self.moe_quant_config = config self.moe_quant_config = config
self.kernel = mk.FusedMoEModularKernel( self.kernel = mk.FusedMoEModularKernel(
# TODO(rob): we can use the generic MoEPrepareAndFinalizeNoEP
# with the changes to defer input quantization
FlashInferAllGatherMoEPrepareAndFinalize( FlashInferAllGatherMoEPrepareAndFinalize(
use_dp=(self.moe.dp_size > 1), use_dp=(self.moe.dp_size > 1),
use_deepseek_fp8_block_scale=self.block_quant, use_deepseek_fp8_block_scale=self.block_quant,
@ -1093,6 +1094,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
Fp8MoeBackend.DEEPGEMM, Fp8MoeBackend.DEEPGEMM,
Fp8MoeBackend.TRITON, Fp8MoeBackend.TRITON,
Fp8MoeBackend.MARLIN, Fp8MoeBackend.MARLIN,
Fp8MoeBackend.AITER,
]: ]:
from vllm.model_executor.layers.fused_moe import ( from vllm.model_executor.layers.fused_moe import (
TritonOrDeepGemmExperts, TritonOrDeepGemmExperts,
@ -1103,24 +1105,33 @@ class Fp8MoEMethod(FusedMoEMethodBase):
from vllm.model_executor.layers.fused_moe.prepare_finalize import ( from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP, MoEPrepareAndFinalizeNoEP,
) )
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
AiterExperts,
)
config = self.get_fused_moe_quant_config(layer) config = self.get_fused_moe_quant_config(layer)
assert config is not None assert config is not None
self.moe_quant_config = config self.moe_quant_config = config
use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
moe_kernel = (
MarlinExperts(quant_config=self.moe_quant_config)
if use_marlin
else TritonOrDeepGemmExperts(
quant_config=self.moe_quant_config,
allow_deep_gemm=allow_deep_gemm,
)
)
self.kernel = mk.FusedMoEModularKernel( if self.fp8_backend == Fp8MoeBackend.AITER:
MoEPrepareAndFinalizeNoEP(), moe_kernel self.kernel = mk.FusedMoEModularKernel(
) # TODO: make defer_input_quant an attr of the AiterExperts
MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
AiterExperts(quant_config=self.moe_quant_config),
)
elif self.fp8_backend == Fp8MoeBackend.MARLIN:
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MarlinExperts(quant_config=self.moe_quant_config),
)
else:
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
TritonOrDeepGemmExperts(
quant_config=self.moe_quant_config,
allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM),
),
)
self.use_inplace = True self.use_inplace = True
def maybe_make_prepare_finalize( def maybe_make_prepare_finalize(
@ -1128,7 +1139,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None: ) -> mk.FusedMoEPrepareAndFinalize | None:
if ( if (
self.rocm_aiter_moe_enabled self.fp8_backend == Fp8MoeBackend.AITER
or self.fp8_backend == Fp8MoeBackend.MARLIN or self.fp8_backend == Fp8MoeBackend.MARLIN
or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
): ):
@ -1161,11 +1172,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
TritonOrDeepGemmExperts, TritonOrDeepGemmExperts,
) )
assert ( if self.fp8_backend in [Fp8MoeBackend.MARLIN, Fp8MoeBackend.AITER]:
self.fp8_backend != Fp8MoeBackend.MARLIN raise NotImplementedError(
) and not self.rocm_aiter_moe_enabled, ( "Marlin and ROCm AITER are not supported with all2all yet."
"Marlin and ROCm AITER are not supported with all2all yet." )
)
assert self.moe_quant_config is not None assert self.moe_quant_config is not None
@ -1313,37 +1323,18 @@ class Fp8MoEMethod(FusedMoEMethodBase):
hidden_states=x, hidden_states=x,
router_logits=router_logits, router_logits=router_logits,
) )
result = self.kernel(
if self.rocm_aiter_moe_enabled: x,
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 layer.w13_weight,
rocm_aiter_fused_experts, layer.w2_weight,
) topk_weights,
topk_ids,
# TODO(rob): convert this to MK. inplace=self.use_inplace,
result = rocm_aiter_fused_experts( activation=layer.activation,
x, global_num_experts=layer.global_num_experts,
layer.w13_weight, expert_map=layer.expert_map,
layer.w2_weight, apply_router_weight_on_input=layer.apply_router_weight_on_input,
topk_weights=topk_weights, )
topk_ids=topk_ids,
activation=layer.activation,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
expert_map=layer.expert_map,
quant_config=self.moe_quant_config,
)
else:
result = self.kernel(
x,
layer.w13_weight,
layer.w2_weight,
topk_weights,
topk_ids,
inplace=self.use_inplace,
activation=layer.activation,
global_num_experts=layer.global_num_experts,
expert_map=layer.expert_map,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
)
return result return result
@ -1456,15 +1447,10 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
layer.w13_input_scale = None layer.w13_input_scale = None
layer.w2_input_scale = None layer.w2_input_scale = None
self.rocm_aiter_moe_enabled = False
def process_weights_after_loading(self, layer: Module) -> None: def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False): if getattr(layer, "_already_called_process_weights_after_loading", False):
return return
# Lazy import to avoid importing triton too early.
self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
# If checkpoint is fp16, quantize in place. # If checkpoint is fp16, quantize in place.
fp8_dtype = current_platform.fp8_dtype() fp8_dtype = current_platform.fp8_dtype()
w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
@ -1481,7 +1467,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
replace_parameter(layer, "w2_weight", w2_weight) replace_parameter(layer, "w2_weight", w2_weight)
# Reshuffle weights for AITER if needed. # Reshuffle weights for AITER if needed.
if self.rocm_aiter_moe_enabled: if self.fp8_backend == Fp8MoeBackend.AITER:
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights(
layer.w13_weight, layer.w2_weight layer.w13_weight, layer.w2_weight
) )
@ -1489,7 +1475,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
replace_parameter(layer, "w2_weight", shuffled_w2) replace_parameter(layer, "w2_weight", shuffled_w2)
# Rushuffle weights for MARLIN if needed. # Rushuffle weights for MARLIN if needed.
if self.fp8_backend == Fp8MoeBackend.MARLIN: elif self.fp8_backend == Fp8MoeBackend.MARLIN:
prepare_moe_fp8_layer_for_marlin( prepare_moe_fp8_layer_for_marlin(
layer, False, input_dtype=self.marlin_input_dtype layer, False, input_dtype=self.marlin_input_dtype
) )

Some files were not shown because too many files have changed in this diff Show More