Merge branch 'main' into rename_file_info_to_pkg/file

This commit is contained in:
Ning Xie 2025-11-28 10:09:08 +08:00 committed by GitHub
commit f851177b97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
419 changed files with 12213 additions and 4609 deletions

View File

@ -7,53 +7,51 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
NUMA_NODE=${NUMA_NODE:-0}
export CMAKE_BUILD_PARALLEL_LEVEL=32
export CMAKE_BUILD_PARALLEL_LEVEL=16
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$NUMA_NODE" || true;
docker rm -f cpu-test || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
# Run the image
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
function cpu_tests() {
set -e
export NUMA_NODE=$2
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
# basic online serving
docker exec cpu-test-"$NUMA_NODE" bash -c '
docker exec cpu-test bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--model Qwen/Qwen3-0.6B \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
@ -61,4 +59,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 2h bash -c cpu_tests

View File

@ -1,10 +1,12 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
@ -45,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-eplb \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \

View File

@ -1486,4 +1486,4 @@ steps:
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

View File

@ -192,6 +192,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -631,6 +632,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
@ -902,11 +904,12 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@ -1116,6 +1119,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1340,11 +1344,20 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

View File

@ -13,7 +13,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0

View File

@ -105,6 +105,31 @@ jobs:
}
],
},
cpu: {
// Keyword search - matches whole words only (with word boundaries)
keywords: [
{
term: "CPU Backend",
searchIn: "title"
},
{
term: "x86",
searchIn: "title"
},
{
term: "ARM",
searchIn: "title"
},
{
term: "Apple Silicon",
searchIn: "title"
},
{
term: "IBM Z",
searchIn: "title"
},
],
},
// Add more label configurations here as needed
// example: {
// keywords: [...],

View File

@ -12,7 +12,7 @@ jobs:
timeout-minutes: 30
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- uses: astral-sh/setup-uv@v7
with:

View File

@ -16,7 +16,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"

View File

@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")

View File

@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).

View File

@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
Currently this just includes dense GEMMs and only works on Hopper GPUs.
Currently, this just includes dense GEMMs and only works on Hopper GPUs.
## Setup

View File

@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) {
const uint head_size, const uint prefix_head_stride,
const uint output_head_stride) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset =
token_idx * num_heads * head_size + head_idx * head_size;
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
scalar_t* output_head_ptr = output + head_offset;
const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
head_idx * prefix_head_stride;
const uint dst_head_offset = token_idx * num_heads * output_head_stride +
head_idx * output_head_stride;
const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \
num_heads, head_size, prefix_head_stride, output_head_stride); \
}
/*@brief Merges the attention states from prefix and suffix
@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
const uint prefix_head_stride = prefix_output.stride(1);
const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();

View File

@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt);

View File

@ -905,91 +905,79 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
int ENTRY_SIZE, int CTA_SIZE>
__global__ void gather_and_maybe_dequant_cache(
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t block_size, const int32_t entry_size,
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
const int32_t num_tokens, const int32_t block_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const float* __restrict__ scale,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
using ltype = vllm::vec_n_t<cache_t, vec_size>;
using stype = vllm::vec_n_t<scalar_t, vec_size>;
// We are adding this for code readability which will be optimized out when
// build in release.
assert(CTA_SIZE == blockDim.x);
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = cu_seq_lens[bid];
const int32_t seq_end = cu_seq_lens[bid + 1];
const int32_t seq_len = seq_end - seq_start;
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
#pragma unroll
for (int token_id = blockIdx.x; token_id < num_tokens;
token_id += gridDim.x) {
int64_t batch_id = token_to_seq[token_id];
int64_t batch_start = cu_seq_lens[batch_id];
int64_t batch_end = cu_seq_lens[batch_id + 1];
int32_t batch_offset = token_id - batch_start;
const int32_t split_start = split * split_blocks;
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
if (token_id >= batch_end) return;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[batch_id];
}
batch_offset += offset;
int32_t block_table_id = batch_offset / block_size;
int32_t slot_id = batch_offset % block_size;
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
int32_t block_id = block_table[block_table_offset];
int64_t cache_offset =
block_id * cache_block_stride + slot_id * cache_entry_stride;
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
scalar_t* dst_ = dst + token_id * dst_entry_stride;
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
const bool is_active_split = (split_start < tot_blocks);
const bool is_last_split = (split_end == tot_blocks);
if (!is_active_split) return;
int32_t full_blocks_end = split_end;
int32_t partial_block_size = 0;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
// page_size)
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[bid] / block_size;
}
const int32_t* batch_block_table = block_table + batch_offset + offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
if (is_last_split) {
partial_block_size = seq_len % block_size;
if (partial_block_size) full_blocks_end -= 1;
}
auto copy_entry = [&](const cache_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
#pragma unroll
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
_dst[i] = static_cast<scalar_t>(_src[i]);
reinterpret_cast<stype*>(dst_)[idx] =
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
} else {
_dst[i] =
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
stype store_val;
#pragma unroll
for (int j = 0; j < vec_size; ++j) {
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
loaded_val.val[j], *scale);
}
reinterpret_cast<stype*>(dst_)[idx] = store_val;
}
}
};
const auto loop_end =
std::min((int64_t)full_blocks_end, block_table_stride - offset);
for (int pid = split_start; pid < loop_end; ++pid) {
auto block_id = batch_block_table[pid];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
for (int eid = 0; eid < block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
if (partial_block_size) {
if (offset + full_blocks_end < block_table_stride) {
auto block_id = batch_block_table[full_blocks_end];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr =
dst + full_blocks_end * block_size * dst_entry_stride;
for (int eid = 0; eid < partial_block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
// process tail
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
src_ = src_ + ENTRY_SIZE - tail_cnt;
#pragma unroll
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst_[idx] = static_cast<scalar_t>(src_[idx]);
} else {
dst_[idx] =
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
}
}
}
@ -1001,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_size, entry_size, block_table_stride, cache_block_stride, \
cache_entry_stride, dst_entry_stride, \
reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
thread_block_size> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \
block_table_stride, cache_block_stride, cache_entry_stride, \
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
// - token_to_seq contains the back mapping from token_id to batch_id
// - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size)
void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t entry_size = src_cache.flatten(2, -1).size(2);
int32_t head_dim = dst.size(-1);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
@ -1038,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@ -1055,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size.
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(1024);
constexpr int32_t thread_block_size = 64;
dim3 grid(num_tokens);
dim3 block(thread_block_size);
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;

View File

@ -847,7 +847,7 @@ struct VecTypeTrait<c10::BFloat16> {
};
#endif
#if !defined(__powerpc__)
#if !defined(__powerpc__) && !defined(__s390x__)
template <>
struct VecTypeTrait<c10::Half> {
using vec_t = vec_op::FP16Vec16;

View File

@ -4,6 +4,7 @@
#include <vecintrin.h>
#include <cmath>
#include <limits>
#include <torch/all.h>
namespace vec_op {
@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
}
explicit FP32Vec8(const BF16Vec8& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
// On big-endian s390x, place BF16 first to get correct byte order
reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
}
float reduce_sum() const {
@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
}
FP32Vec8 exp() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::exp(ar.values[0]);
ret.val[0][1] = std::exp(ar.values[1]);
ret.val[0][2] = std::exp(ar.values[2]);
ret.val[0][3] = std::exp(ar.values[3]);
ret.val[1][0] = std::exp(ar.values[4]);
ret.val[1][1] = std::exp(ar.values[5]);
ret.val[1][2] = std::exp(ar.values[6]);
ret.val[1][3] = std::exp(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
f32x4x2_t out;
const __vector float log2e = vec_splats(1.44269504088896341f);
const __vector float one = vec_splats(1.0f);
const __vector float min_x = vec_splats(-87.3f);
const __vector float max_x = vec_splats(88.7f);
// 5th-degree minimax polynomial for 2^r (r in [0,1))
const __vector float c1 = vec_splats(0.6931471805599453f);
const __vector float c2 = vec_splats(0.240226506959101f);
const __vector float c3 = vec_splats(0.05550410866482158f);
const __vector float c4 = vec_splats(0.009618129107628477f);
const __vector float c5 = vec_splats(0.0013333558146428443f);
for (int i = 0; i < 2; i++) {
__vector float x = reg.val[i];
x = vec_max(x, min_x);
x = vec_min(x, max_x);
__vector float y = vec_mul(x, log2e);
__vector float kf = vec_floor(y);
__vector float r = vec_sub(y, kf);
__vector signed int k = vec_signed(kf);
const __vector signed int min_k = vec_splats((signed int)-126);
const __vector signed int max_k = vec_splats((signed int)127);
k = vec_min(vec_max(k, min_k), max_k);
// Build 2^k from exponent bits
__vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
__vector unsigned int bits = (__vector unsigned int)exp_int;
bits = vec_sl(bits, vec_splats((unsigned int)23));
__vector float pow2k = (__vector float)bits;
// Improved minimax polynomial
__vector float poly = vec_madd(c5, r, c4);
poly = vec_madd(poly, r, c3);
poly = vec_madd(poly, r, c2);
poly = vec_madd(poly, r, c1);
poly = vec_madd(poly, r, one);
out.val[i] = vec_mul(pow2k, poly);
}
return FP32Vec8(out);
}
FP32Vec8 tanh() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::tanh(ar.values[0]);
ret.val[0][1] = std::tanh(ar.values[1]);
ret.val[0][2] = std::tanh(ar.values[2]);
ret.val[0][3] = std::tanh(ar.values[3]);
ret.val[1][0] = std::tanh(ar.values[4]);
ret.val[1][1] = std::tanh(ar.values[5]);
ret.val[1][2] = std::tanh(ar.values[6]);
ret.val[1][3] = std::tanh(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
// tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
const __vector float one = vec_splats(1.0f);
const __vector float two = vec_splats(2.0f);
const __vector float zero = vec_splats(0.0f);
const __vector float sat =
vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
f32x4x2_t out;
for (int i = 0; i < 2; i++) {
__vector float x = reg.val[i];
__vector float ax = vec_abs(x);
// sign(x): +1 or -1
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
// saturation mask: |x| > sat
__vector __bool int saturated = vec_cmpgt(ax, sat);
// 2x
__vector float two_x = vec_mul(x, two);
// Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
f32x4x2_t tmp;
tmp.val[0] = two_x;
tmp.val[1] = two_x;
FP32Vec8 exp_2x_vec(tmp);
FP32Vec8 e2x = exp_2x_vec.exp();
__vector float e = e2x.reg.val[i];
// tanh(x) = (e - 1) / (e + 1)
__vector float num = vec_sub(e, one);
__vector float den = vec_add(e, one);
__vector float t = vec_div(num, den);
// For large |x|, clamp to sign(x)
out.val[i] = vec_sel(t, sign, saturated);
}
return FP32Vec8(out);
}
FP32Vec8 er() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::erf(ar.values[0]);
ret.val[0][1] = std::erf(ar.values[1]);
ret.val[0][2] = std::erf(ar.values[2]);
ret.val[0][3] = std::erf(ar.values[3]);
ret.val[1][0] = std::erf(ar.values[4]);
ret.val[1][1] = std::erf(ar.values[5]);
ret.val[1][2] = std::erf(ar.values[6]);
ret.val[1][3] = std::erf(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
// A&S 7.1.26 approximation:
// erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
// exp(-x^2)) t = 1 / (1 + p*|x|), p = 0.3275911
const __vector float one = vec_splats(1.0f);
const __vector float zero = vec_splats(0.0f);
const __vector float p = vec_splats(0.3275911f);
// Polynomial coeffs
const __vector float a1 = vec_splats(0.254829592f);
const __vector float a2 = vec_splats(-0.284496736f);
const __vector float a3 = vec_splats(1.421413741f);
const __vector float a4 = vec_splats(-1.453152027f);
const __vector float a5 = vec_splats(1.061405429f);
// Threshold where erf(x) ~ sign(x)
const __vector float sat = vec_splats(6.0f);
f32x4x2_t out;
for (int lane = 0; lane < 2; lane++) {
__vector float x = reg.val[lane];
__vector float ax = vec_abs(x);
// sign(x)
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
// |x| > 6 → erf(x) = ±1
__vector __bool int saturated = vec_cmpgt(ax, sat);
// t = 1 / (1 + p * |x|)
__vector float t = vec_madd(p, ax, one);
t = vec_div(one, t);
// poly = a5
__vector float poly = a5;
poly = vec_madd(poly, t, a4);
poly = vec_madd(poly, t, a3);
poly = vec_madd(poly, t, a2);
poly = vec_madd(poly, t, a1);
// full polynomial: poly = poly * t
poly = vec_mul(poly, t);
// Compute exp(-x^2)
__vector float x2 = vec_mul(x, x);
__vector float neg_x2 = vec_neg(x2);
f32x4x2_t tmp;
tmp.val[0] = neg_x2;
tmp.val[1] = neg_x2;
FP32Vec8 exp_neg_x2(tmp);
FP32Vec8 e = exp_neg_x2.exp();
__vector float ex = e.reg.val[lane];
// erf(x) = sign * (1 - poly * exp(-x^2))
__vector float term = vec_mul(poly, ex);
__vector float y = vec_sub(one, term);
y = vec_mul(y, sign);
// saturated → ±1
__vector float sat_val = vec_mul(sign, one);
out.val[lane] = vec_sel(y, sat_val, saturated);
}
return FP32Vec8(out);
}
// Elementwise sigmoid(x) = 1 / (1 + exp(-x))
FP32Vec8 sigmoid() const {
const __vector float one = vec_splats(1.0f);
f32x4x2_t neg;
for (int i = 0; i < 2; ++i) {
neg.val[i] = vec_neg(reg.val[i]);
}
FP32Vec8 neg_x(neg);
FP32Vec8 e = neg_x.exp(); // exp(-x)
f32x4x2_t denom;
for (int i = 0; i < 2; ++i) {
denom.val[i] = vec_add(one, e.reg.val[i]);
}
FP32Vec8 denom_vec(denom);
FP32Vec8 one_vec(1.0f);
return one_vec / denom_vec;
}
// Tanh-based GELU:
// gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
FP32Vec8 gelu_tanh() const {
const __vector float k_s2pi = vec_splats(0.7978845608028654f); // √(2/π)
const __vector float k_0_0447 = vec_splats(0.044715f);
f32x4x2_t x2, x3, inner;
for (int i = 0; i < 2; ++i) {
__vector float x = reg.val[i];
x2.val[i] = vec_mul(x, x); // x^2
x3.val[i] = vec_mul(x2.val[i], x); // x^3
__vector float t = vec_madd(k_0_0447, x3.val[i], x); // x + 0.044715*x^3
inner.val[i] = vec_mul(k_s2pi, t); // √(2/π)*(...)
}
FP32Vec8 inner_vec(inner);
FP32Vec8 t = inner_vec.tanh(); // tanh part
FP32Vec8 one_vec(1.0f);
FP32Vec8 half_vec(0.5f);
FP32Vec8 x_vec(*this);
return x_vec * half_vec * (one_vec + t);
}
// Erf-based GELU:
// gelu(x) = 0.5 * x * (1 + erf(x / √2))
FP32Vec8 gelu_erf() const {
const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f); // 1/√2
FP32Vec8 x_vec(*this);
f32x4x2_t scaled;
for (int i = 0; i < 2; ++i) {
scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
}
FP32Vec8 x_scaled(scaled);
FP32Vec8 erf_x = x_scaled.er();
FP32Vec8 one_vec(1.0f);
FP32Vec8 half_vec(0.5f);
return x_vec * half_vec * (one_vec + erf_x);
}
// Elementwise reciprocal: 1/x (scalar per lane, for correctness)
FP32Vec8 rcp() const {
AliasReg in, out;
in.reg = reg;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
out.values[i] = 1.0f / in.values[i];
}
return FP32Vec8(out.reg);
}
// Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
FP32Vec8 rsqrt() const {
AliasReg in, out;
in.reg = reg;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
out.values[i] = 1.0f / std::sqrt(in.values[i]);
}
return FP32Vec8(out.reg);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
}
explicit FP32Vec16(const BF16Vec16& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
// On big-endian s390x, place BF16 first to get correct byte order
reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
}
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return result;
}
FP32Vec16 max(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
vec_max(reg.val[1], b.reg.val[1]),
vec_max(reg.val[2], b.reg.val[2]),
vec_max(reg.val[3], b.reg.val[3])}));
}
float reduce_max() const {
AliasReg ar;
ar.reg = reg;
float result = ar.values[0];
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) {
if (ar.values[i] > result) result = ar.values[i];
});
return result;
}
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
@ -402,15 +628,14 @@ struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
};
// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
using FP16Vec16 = FP32Vec16;
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
acc = acc + a * b;
}
namespace c10 {
struct BFloat16 {
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
@ -429,6 +654,79 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
// intrinsics
// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_madd(a.reg, b.reg, acc.reg);
}
// FP32Vec8 FMA: acc = acc + (a * b)
FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
// FP32Vec16 FMA: acc = acc + (a * b)
FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
// Multiply-Subtract: acc = acc - (a * b)
FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_msub(a.reg, b.reg, acc.reg);
}
FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
// Negative Multiply-Add: acc = -(a * b) + acc
FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
}
FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
// Negative Multiply-Subtract: acc = -(a * b) - acc
FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
}
FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int lsb0 = inp0 >> sh16;
__vector unsigned int lsb1 = inp1 >> sh16;
lsb0 = lsb0 & one;
lsb1 = lsb1 & one;
__vector unsigned int rnd0 = lsb0 + bias;
__vector unsigned int rnd1 = lsb1 + bias;
inp0 = inp0 + rnd0;
inp1 = inp1 + rnd1;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
inp0 = vec_sel(inp0, nan, sel0);
inp1 = vec_sel(inp1, nan, sel1);
inp0 = inp0 >> sh16;
inp1 = inp1 >> sh16;
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}
@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
__vector unsigned int lsb0 = inp0 >> sh16;
__vector unsigned int lsb1 = inp1 >> sh16;
__vector unsigned int lsb2 = inp2 >> sh16;
__vector unsigned int lsb3 = inp3 >> sh16;
lsb0 = lsb0 & one;
lsb1 = lsb1 & one;
lsb2 = lsb2 & one;
lsb3 = lsb3 & one;
__vector unsigned int rnd0 = lsb0 + bias;
__vector unsigned int rnd1 = lsb1 + bias;
__vector unsigned int rnd2 = lsb2 + bias;
__vector unsigned int rnd3 = lsb3 + bias;
inp0 = inp0 + rnd0;
inp1 = inp1 + rnd1;
inp2 = inp2 + rnd2;
inp3 = inp3 + rnd3;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel3 =
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
inp2 = vec_sel(inp2, nan, sel2) >> sh16;
inp3 = vec_sel(inp3, nan, sel3) >> sh16;
inp0 = vec_sel(inp0, nan, sel0);
inp1 = vec_sel(inp1, nan, sel1);
inp2 = vec_sel(inp2, nan, sel2);
inp3 = vec_sel(inp3, nan, sel3);
inp0 = inp0 >> sh16;
inp1 = inp1 >> sh16;
inp2 = inp2 >> sh16;
inp3 = inp3 >> sh16;
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}
inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
// 1D softmax over `n` elements in `input`, writes result to `output`.
// Uses FP32Vec8 for main body, scalar tail handling.
// Requirement: n > 0
FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
if (n <= 0) return;
// ---------- Pass 1: find max ----------
float max_val = -std::numeric_limits<float>::infinity();
int i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 v(input + i);
FP32Vec8::AliasReg ar;
ar.reg = v.reg;
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
if (ar.values[j] > max_val) max_val = ar.values[j];
}
}
for (; i < n; ++i) {
if (input[i] > max_val) max_val = input[i];
}
// ---------- Pass 2: compute exp(x - max) and sum ----------
float sum = 0.0f;
i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
float tmp[FP32Vec8::VEC_ELEM_NUM];
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
tmp[j] = input[i + j] - max_val;
}
FP32Vec8 v(tmp);
FP32Vec8 e = v.exp();
FP32Vec8::AliasReg ar;
ar.reg = e.reg;
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
output[i + j] = ar.values[j];
sum += ar.values[j];
}
}
// Tail
for (; i < n; ++i) {
float x = input[i] - max_val;
float ex = std::exp(x); // scalar tail
output[i] = ex;
sum += ex;
}
// ---------- Pass 3: normalize ----------
float inv_sum = 1.0f / sum;
i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
float tmp[FP32Vec8::VEC_ELEM_NUM];
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
tmp[j] = output[i + j] * inv_sum;
}
FP32Vec8 v(tmp);
v.save(output + i);
}
for (; i < n; ++i) {
output[i] *= inv_sum;
}
}
// 1D RMSNorm kernel:
// input: x[0..n-1]
// weight: w[0..n-1] (gamma), may be nullptr
// output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
// eps: small epsilon for numerical stability
FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
const float* weight, int n, float eps) {
if (n <= 0) return;
// ---------- Pass 1: compute sum of squares ----------
float sum_sq = 0.0f;
int i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 x_vec(input + i);
FP32Vec8 sq = x_vec * x_vec;
FP32Vec8::AliasReg ar;
ar.reg = sq.reg;
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
sum_sq += ar.values[j];
}
}
// Tail
for (; i < n; ++i) {
float v = input[i];
sum_sq += v * v;
}
float mean_sq = sum_sq / static_cast<float>(n);
float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
// ---------- Pass 2: scale (and apply weight if given) ----------
const float inv_rms_f = inv_rms;
i = 0;
if (weight) {
// with gamma
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 x_vec(input + i);
float wtmp[FP32Vec8::VEC_ELEM_NUM];
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
wtmp[j] = weight[i + j];
}
FP32Vec8 w_vec(wtmp);
FP32Vec8 scale_vec(inv_rms_f);
FP32Vec8 y = x_vec * scale_vec * w_vec;
y.save(output + i);
}
for (; i < n; ++i) {
output[i] = input[i] * inv_rms_f * weight[i];
}
} else {
// without gamma
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 x_vec(input + i);
FP32Vec8 scale_vec(inv_rms_f);
FP32Vec8 y = x_vec * scale_vec;
y.save(output + i);
}
for (; i < n; ++i) {
output[i] = input[i] * inv_rms_f;
}
}
}
// Prefetch data to cache for better memory access performance
FORCE_INLINE void prefetch(const void* addr) {
__builtin_prefetch(addr, 0, 3); // 0=read, 3=high temporal locality
}
}; // namespace vec_op

View File

@ -489,14 +489,16 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < 4; i++) {
int idx = tid4 * 4 + i;
idx = idx < block_num_valid_tokens ? idx : 0;
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
sh_block_topk_weights[idx] = __hmul2(
global_scale, Dtype::num2num2(Dtype::float2num(
topk_weights_ptr[sh_block_sorted_ids[idx]])));
} else {
sh_block_topk_weights[idx] = Dtype::num2num2(
Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
if (idx < block_num_valid_tokens) {
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
sh_block_topk_weights[idx] =
__hmul2(global_scale,
Dtype::num2num2(Dtype::float2num(
topk_weights_ptr[sh_block_sorted_ids[idx]])));
} else {
sh_block_topk_weights[idx] = Dtype::num2num2(
Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
}
}
}
}

View File

@ -52,14 +52,13 @@ void paged_attention_v2(
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
const int64_t blocksparse_head_sliding_step);
#ifndef USE_ROCM
void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse);
#ifndef USE_ROCM
void convert_vertical_slash_indexes(
torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]

View File

@ -22,6 +22,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cutlass_extensions/common.hpp"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
void run_fp4_blockwise_scaled_group_mm_sm100(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
void run_fp4_blockwise_scaled_group_mm_sm120(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
using ProblemShape =
cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
using ElementType = cutlass::float_e2m1_t;
using ElementSFType = cutlass::float_ue4m3_t;
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
// NOTE: For SM120 it seems templating the output type is not supported and
// we need to hardcode the output type to bfloat16
using ElementC = cutlass::bfloat16_t;
using ElementD = ElementC;
using ElementAccumulator = float;
// Layout definitions
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = LayoutC;
// Alignment constraints
static constexpr int AlignmentA = 32;
static constexpr int AlignmentB = 32;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Architecture definitions
using ArchTag = cutlass::arch::Sm120;
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
using ClusterShape = Shape<_1, _1, _1>;
using MmaTileShape = Shape<_128, _128, _128>;
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
LayoutD*, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using LayoutSFA =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
using ScaleConfig =
typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = static_cast<int>(expert_offsets.size(0));
auto options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
torch::Tensor c_strides1 =
torch::full({num_experts}, output.stride(0), options_int);
torch::Tensor a_strides1 =
torch::full({num_experts}, a.stride(0) * 2, options_int);
torch::Tensor b_strides1 =
torch::full({num_experts}, b.stride(1) * 2, options_int);
run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
expert_offsets, sf_offsets, problem_sizes, M, N, K);
// Create an instance of the GEMM
Gemm gemm_op;
// Initialize problem_sizes_as_shapes correctly
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Set the Scheduler info
cutlass::KernelHardwareInfo hw_info;
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
scheduler.raster_order = RasterOrderOptions::AlongM;
hw_info.device_id = a.get_device();
static std::unordered_map<int, int> cached_sm_counts;
if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
cached_sm_counts[hw_info.device_id] =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
}
hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementType**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(a_strides1.data_ptr()),
static_cast<const ElementType**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(b_strides1.data_ptr()),
static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(c_strides1.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(c_strides1.data_ptr())};
auto& fusion_args = epilogue_args.thread;
fusion_args.alpha_ptr_array =
reinterpret_cast<float**>(alpha_ptrs.data_ptr());
fusion_args.dAlpha = {_0{}, _0{}, 1};
fusion_args.beta = 0.0f;
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info,
scheduler};
size_t workspace_size = Gemm::get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
auto workspace = torch::empty(workspace_size, workspace_options);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
if (version_num >= 120 && version_num < 130) {
run_fp4_blockwise_scaled_group_mm_sm120(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
if (version_num >= 100 && version_num < 120) {
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
version_num, ". Required capability: 100 or 120");
}
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
// Input validation
CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
} else {
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
int32_t version_num = get_sm_version_num();
if (version_num >= 120 && version_num < 130) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
output.scalar_type());
}
#endif
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel, vLLM must "
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA "
"12.8 or above.");
"be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
"and CUDA 12.8 or above.");
#endif
}

View File

@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
constexpr auto INT = at::ScalarType::Int;
constexpr auto UINT8 = at::ScalarType::Byte;
void scaled_fp4_experts_quant_sm100a(
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,

View File

@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input_sf);
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
void scaled_fp4_experts_quant_sm100a(
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,
torch::Tensor const& output_scale_offset_by_experts) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
return scaled_fp4_experts_quant_sm100a(
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_experts_quant_sm1xxa(
output, output_scale, input, input_global_scale, input_offset_by_experts,
output_scale_offset_by_experts);
#endif

View File

@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias);
#endif
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
#if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
(defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
(defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation,
output_permutation, num_experts, n, k,
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_moe_mm_problem_sizes(
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k,
blockscale_offsets);
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
false,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -63,7 +63,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
#ifndef USE_ROCM
// Merge attn states
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
@ -76,7 +75,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor suffix_output,"
" Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
#ifndef USE_ROCM
ops.def(
"convert_vertical_slash_indexes("
" Tensor! block_count, Tensor! block_offset, "
@ -695,7 +694,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
cache_ops.def(
"gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, "
" Tensor block_table, Tensor cu_seq_lens, "
" int batch_size, "
" Tensor token_to_seq, "
" int num_tokens, "
" str kv_cache_dtype, "
" Tensor scale, Tensor? seq_starts) -> ()");
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,

View File

@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
@ -85,7 +85,20 @@ ARG GET_PIP_URL
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
&& apt-get install -y --no-install-recommends \
ccache \
software-properties-common \
git \
curl \
sudo \
python3-pip \
libibverbs-dev \
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
gcc-10 \
g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
&& rm -rf /var/lib/apt/lists/* \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
@ -110,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
@ -235,9 +244,15 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP)
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Check the size of the wheel if RUN_WHEEL_CHECK is true
@ -268,7 +283,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
@ -305,8 +320,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y software-properties-common curl sudo python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& apt-get install -y --no-install-recommends \
software-properties-common \
curl \
sudo \
python3-pip \
ffmpeg \
libsm6 \
libxext6 \
libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
@ -321,13 +343,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
libibverbs-dev \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install CUDA development tools and build essentials for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
apt-get update -y && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
@ -359,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.5.2 \
&& uv pip install --system flashinfer-jit-cache==0.5.2 \
uv pip install --system flashinfer-cubin==0.5.3 \
&& uv pip install --system flashinfer-jit-cache==0.5.3 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config

View File

@ -119,7 +119,6 @@ FROM base AS vllm-test-deps
WORKDIR /workspace/vllm
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
@ -132,9 +131,6 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
esac; \
}; \
remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \

View File

@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# must put before installing xformers, so it can install the correct version of xfomrers.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build xformers with cuda and torch nightly
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# todo(elainewy): cache xformers build result for faster build
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl --verbose
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system vllm-dist/*.whl --verbose
# install xformers again for the new environment
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
# install package for build flashinfer
@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
RUN pip freeze | grep -E 'torch|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze

View File

@ -1,4 +1,4 @@
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
@ -25,10 +25,14 @@ RUN apt clean && apt-get update -y && \
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
@ -72,6 +76,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
# remove torch bundled oneccl to avoid conflicts
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 KiB

After

Width:  |  Height:  |  Size: 146 KiB

View File

@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
Below you'll find slides and recordings from our previous meetups:
- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
- [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)

View File

@ -98,21 +98,6 @@ to warm it up so that future builds are faster.
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
</p>
## Update dependencies
Several vLLM dependencies like xFormers depend on PyTorch and need
to be updated accordingly. Rather than waiting for all of them to publish new
releases (which would take too much time), they can be built from
source to unblock the update process.
### xFormers
```bash
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
MAX_JOBS=16 uv pip install --system \
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
```
## Update all the different vLLM platforms
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable

View File

@ -29,7 +29,7 @@ The initialization code should look like this:
```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention import Attention
from vllm.attention.layer import Attention
class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):

View File

@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.

View File

@ -84,12 +84,14 @@ See the following figures for a quick comparison between the previous and curren
```python
class BatchDescriptor(NamedTuple):
num_tokens: int
uniform_decode: bool = False
num_reqs: int
uniform: bool = False
has_lora: bool = False
```
where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`.
where `num_tokens` can be the padded token length, and `uniform` indicates if all the requests have the same query lengths. Many attention backends only support full cudagraphs when the batches are uniform; pure decode batches are uniform but may not be query length 1 (i.e. `num_tokens == num_reqs`), this occurs in the validation pass of spec-decode where "decode" batches will have a query length of `1+num_spec_tokens`.
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item.
!!! note
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).

View File

@ -151,6 +151,76 @@ To avoid this, please either:
2. wrap the branching logic into a custom operator. TorchDynamo does not
trace into custom operators.
## Debugging constraint violations and dynamic shapes guards issues
Dynamic-shape guards are a specific category of Dynamo guards. They are constraints that `torch.compile`
attaches to dynamic dimensions (e.g., `seq_len`) to ensure the compiled artifact remains valid.
These guards typically appear when framework code, custom passes, or user code branches based on
dynamic shape values.
**Example:**
```python
if x > 10:
# path A
else:
# path B
```
This creates a guard `x > 10` or `x <= 10` depending on which path was traced.
**vLLM's Assumption:**
vLLM assumes that all guards added by torch.compile are safe to drop and will not
constrain the compiled graph to specific input shapes. When this assumption is violated,
it can cause issues that users need to debug.
Some side effects that indicates this assumption is violated are runtime errors
or `ConstraintViolationErrors`.
A `ConstraintViolationErrors` will be thrown if a dynamic shape gets constrained to
a single value. If you encounter a constraint violation error or suspect that a dynamic
shapes guard is being added incorrectly, you can use stricter dynamic shape modes to
help debug the issue:
```sh
# Online - using unbacked mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
# Online - using backed_size_oblivious mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
```
```py
# Offline - using unbacked mode
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
LLM(model, compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.UNBACKED)
))
# Offline - using backed_size_oblivious mode
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
LLM(model, compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS)
))
```
These modes are stricter and reduce or eliminate the need of dynamic shapes guarding, which can help isolate issues:
- `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added
- `backed_size_oblivious`: Uses a mode that is more strict about guarding.
For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping).
### Printing guards
To see all guards that are being added during compilation, you can use `TORCH_LOGS=+dynamic`:
```sh
TORCH_LOGS=+dynamic vllm serve meta-llama/Llama-3.2-1B
```
Look for `[guard added]` in the logs to see where guards are being added. This can help you identify which operations are
causing guards to be added incorrectly.
## Debugging TorchInductor
TorchInductor takes a captured graph and then compiles it down to some Python code

View File

@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
- [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

View File

@ -0,0 +1,69 @@
<!-- markdownlint-disable -->
# Optimization Levels
## Overview
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechnaism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out of the box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
## Level Summaries and Usage Examples
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=0
)
```
#### `-O1`: Quick Optimizations
- **Startup**: Moderate startup time
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
- **Use case**: Balance for most development scenarios
```bash
# CLI usage
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=1
)
```
#### `-O2`: Full Optimizations (Default)
- **Startup**: Longer startup time
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
```bash
# CLI usage (default, so optional)
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
# Python API usage
from vllm.entrypoints.llm import LLM
llm = LLM(
model="RedHatAI/Llama-3.2-1B-FP8",
optimization_level=2 # This is the default
)
```
#### `-O3`: Full Optimization
Still in development. Added infrastructure to prevent changing API in future
release. Currently behaves the same O2.
## Troubleshooting
### Common Issues
1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
3. **Performance Issues**: Ensure using `-O2` for production

View File

@ -29,6 +29,109 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
## Dynamic shapes and vllm guard dropping
`torch.compile` is designed to guard on dynamic shapes with no hesitation
when needed. This contradicts with vLLM's `torch.compile` approach of
dropping the guards since many of those guards could be material.
`torch.compile` provides two kinds of dynamic shapes: `backed` and `unbacked`.
`torch.compile` guards on `backed` dynamic shapes and does not provide a
guarantee that no guards will be added to them. User code, dynamo,
inductor, and autograd all can add guards. Moreover, for 0/1
specializations, backed symbols are specialized unconditionally to 0, 1,
or >=2 even without encountering a branching on those ranges.
On the contrary, `unbacked` dynamic shapes are guaranteed not to be guarded
on and are not 0/1 specialized. However, there is a possibility of
throwing a data dependent error when a branch that requires their value is
encountered and no explicit unbacked handling is defined. The framework is
converging to a state where it won't throw DDE but rather pick general
paths. One downside of using unbacked is missed optimization opportunities
due to either perf bugs or picking general paths, also using a fixed
non-example input-based hint (this will be fixed soon with override_hint
API). An example of picking general paths is assuming input not contiguous
in functions call contiguous() and reshape() when can't be symbolically proven
with a change of introducing a clone.
`backed_size_oblivious` is a flag that enables treating backed symbols as
unbacked wherever explicit handling for unbacked is defined. With this
mode, 0/1 specializations are mostly avoided in framework code and the
default 0/1 specialization does not happen. However, there is still no
guarantee that torch.compile won't guard, especially due to user code or
custom passes. `backed_size_oblivious` is experimental in PyTorch compile
and could be deprecated. That said, it's a safer option to use than
`backed` and the probability of reducing performance is lower than
`unbacked`.
### Configuring Dynamic Shapes
The `DynamicShapesConfig` allows you to control the dynamic shapes behavior by
setting the `type` field. You can choose between three modes:
`BACKED`(default), `UNBACKED` , and `BACKED_SIZE_OBLIVIOUS`.
#### Offline Inference Example (Using LLM class)
When using the `LLM` class for offline inference, you can configure dynamic
shapes through the `compilation_config` parameter:
```python
from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
# Example: Using backed_size_oblivious (experimental, safer than backed)
llm = LLM(
model="meta-llama/Llama-3.2-1B",
compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(
type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS
)
)
)
# Example: Using unbacked (strongest guarantee against guards)
llm = LLM(
model="meta-llama/Llama-3.2-1B",
compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(
type=DynamicShapesType.UNBACKED
)
)
)
# Generate outputs
prompts = ["Hello, my name is", "The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
```
#### Online Serving Example (Using vllm serve)
When using `vllm serve` for online serving, you can configure dynamic shapes
through the `--compilation-config` flag:
```bash
# Example: Using unbacked
vllm serve meta-llama/Llama-3.2-1B \
--compilation-config '{"dynamic_shapes_config": {"type": "unbacked"}}'
# Alternative: Using dot notation (simpler for single values)
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
```
#### Choosing the Right Mode
- **BACKED** (default): Use when you're willing to accept potential unsafe dropping of guards
for maximal performance. Guard could be unsoundly added and then ignored.
- **UNBACKED** Use when you need the strongest guarantee against guards.
This is the most conservative option but may miss some optimization opportunities.
- **BACKED_SIZE_OBLIVIOUS**: Use when you want a balance between avoiding guards
and performance. This experimental mode is safer than BACKED but still not as
conservative as UNBACKED.
## Python Code Compilation
In the very verbose logs, we can see:
@ -122,7 +225,7 @@ When all the shapes are known, `torch.compile` can compare different configs, an
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
mm 0.0160 ms 81.6%
mm 0.0160 ms 81.6%
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2

View File

@ -22,9 +22,6 @@ export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxab
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
```
!!! tip
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
!!! tip
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.

View File

@ -7,7 +7,7 @@ This document shows you some examples of the different options that are
available to generate structured outputs.
!!! warning
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
If you are still using the following deprecated API fields which were removed in v0.12.0, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`

View File

@ -283,7 +283,7 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
For AMD ROCm, you can further control the specific Attention implementation using the following variables:

View File

@ -1,25 +1,33 @@
# CPU - Intel® Xeon®
## Validated Hardware
| Hardware |
| ----------------------------------------- |
| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
## Supported Models
### Text-only Language Models
| Model | Architecture | Supported |
|--------------------------------------|-------------------------------------------|-----------|
| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ |
| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ |
| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ |
| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ |
| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ |
| zai-org/GLM-4.5 | GLMForCausalLM | ✅ |
| google/gemma | GemmaForCausalLM | ✅ |
| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ |
| meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM | ✅ |
| ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM | ✅ |
| Qwen/Qwen3-1.7B | Qwen3ForCausalLM | ✅ |
| Qwen/Qwen3-4B | Qwen3ForCausalLM | ✅ |
| Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ |
| zai-org/glm-4-9b-hf | GLMForCausalLM | ✅ |
| google/gemma-7b | GemmaForCausalLM | ✅ |
### Multimodal Language Models
| Model | Architecture | Supported |
|--------------------------------------|-------------------------------------------|-----------|
| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ |
| openai/whisper | WhisperForConditionalGeneration | ✅ |
| Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ |
| openai/whisper-large-v3 | WhisperForConditionalGeneration | ✅ |
✅ Runs and optimized.
🟨 Runs and correct but not optimized to green yet.

View File

@ -0,0 +1,65 @@
# XPU - Intel® GPUs
## Validated Hardware
| Hardware |
| ----------------------------------------- |
| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
## Supported Models
### Text-only Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
| openai/gpt-oss-20b | GPTForCausalLM | | | ✅ |
| openai/gpt-oss-120b | GPTForCausalLM | | | ✅ |
| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM | ✅ | ✅ | |
| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-14B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM | ✅ | ✅ | |
| Qwen/QwQ-32B | QwenForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-V2-Lite | DeepSeekForCausalLM | ✅ | ✅ | |
| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | ✅ | |
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
| THUDM/GLM-4-9B-chat | GLMForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) | ✅ | ✅ | |
| 01-ai/Yi1.5-34B-Chat | YiForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM | ✅ | ✅ | |
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM | ✅ | ✅ | |
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM | ✅ | ✅ | |
| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM | ✅ | ✅ | |
### Multimodal Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration | ✅ | ✅ | |
| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration | ✅ | ✅ | |
| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration | ✅ | ✅ | |
| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration | ✅ | ✅ | |
### Embedding and Reranker Language Models
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
| ----------------------- | ------------------------------ | ---- | ----------- | ----- |
| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding | ✅ | ✅ | |
| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification | ✅ | ✅ | |
✅ Runs and optimized.
🟨 Runs and correct but not optimized to green yet.
❌ Does not pass accuracy test or does not run.

View File

@ -1,15 +1,15 @@
# Pooling Models
vLLM also supports pooling models, such as embedding, classification and reward models.
vLLM also supports pooling models, such as embedding, classification, and reward models.
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
before returning them.
!!! note
We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly.
We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
We are now planning to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
## Configuration
@ -19,7 +19,7 @@ Run a model in pooling mode via the option `--runner pooling`.
!!! tip
There is no need to set this option in the vast majority of cases as vLLM can automatically
detect the model runner to use via `--runner auto`.
detect the appropriate model runner via `--runner auto`.
### Model Conversion
@ -78,7 +78,7 @@ When loading [Sentence Transformers](https://huggingface.co/sentence-transformer
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
You can further customize this via the `--pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
which takes priority over both the model's and Sentence Transformers' defaults.
## Offline Inference
@ -168,11 +168,11 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
- For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
- For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
- For similarity scores, use `LLM.score(...)`.
- For similarity scores, use `LLM.score(...)`.
- For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
- For token classification, use `pooling_task="token_classify"`.
- For multi-vector retrieval, use `pooling_task="token_embed"`
- For IO Processor Plugins , use `pooling_task="plugin"`
- For multi-vector retrieval, use `pooling_task="token_embed"`.
- For IO Processor Plugins, use `pooling_task="plugin"`.
```python
from vllm import LLM
@ -194,15 +194,15 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
!!! note
Please use one of the more specific methods or set the task directly when using [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.:
Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
- For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`.
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
- For rewards, `task":"token_classify"`.
- For token classification, use `task":"token_classify"`.
- For multi-vector retrieval, use `task":"token_embed"`
- For IO Processor Plugins , use `task":"plugin"`
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
- For rewards, use `"task":"token_classify"`.
- For token classification, use `"task":"token_classify"`.
- For multi-vector retrieval, use `"task":"token_embed"`.
- For IO Processor Plugins, use `"task":"plugin"`.
```python
# start a supported embeddings model server with `vllm serve`, e.g.
@ -232,7 +232,7 @@ for output in response.json()["data"]:
## Matryoshka Embeddings
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost.
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
!!! warning
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
@ -245,9 +245,9 @@ for output in response.json()["data"]:
### Manually enable Matryoshka Embeddings
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions.
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online).
For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled.
@ -278,7 +278,7 @@ A code example can be found here: [examples/offline_inference/pooling/embed_matr
### Online Inference
Use the following command to start vllm server.
Use the following command to start the vLLM server.
```bash
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
@ -310,11 +310,11 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope
### Encode task
We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`:
We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
- `token_embed` is the same as embed, using normalize as activation.
- `token_classify` is the same as classify, default using softmax as activation.
- `token_embed` is the same as `embed`, using normalization as the activation.
- `token_classify` is the same as `classify`, by default using softmax as the activation.
### Remove softmax from PoolingParams
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.

View File

@ -680,6 +680,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |

View File

@ -49,7 +49,8 @@ We currently support the following OpenAI APIs:
- *Note: `suffix` parameter is not supported.*
- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `parallel_tool_calls` and `user` parameters are ignored.*
- *Note: `user` parameter is ignored.*
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md).
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)

15
examples/offline_inference/audio_language.py Normal file → Executable file
View File

@ -425,6 +425,13 @@ def parse_args():
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -434,6 +441,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
audio_count = args.num_audios
req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count
@ -446,6 +459,8 @@ def main(args):
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
# We set temperature to 0.2 so that outputs can be different

View File

@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""
from typing import NamedTuple
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser
class QueryResult(NamedTuple):
inputs: dict
limit_mm_per_prompt: dict[str, int]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
def get_mixed_modalities_query() -> QueryResult:
question = (
"What is recited in the audio? "
"What is the content of this image? Why is this video funny?"
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
"<|vision_start|><|image_pad|><|vision_end|>"
"<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
"image": convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB"
),
"video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
},
},
limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
)
def get_use_audio_in_video_query() -> QueryResult:
question = (
"Describe the content of the video in details, then convert what the "
"baby say into text."
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"video": asset.np_ndarrays,
"audio": audio,
},
"mm_processor_kwargs": {
"use_audio_in_video": True,
},
},
limit_mm_per_prompt={"audio": 1, "video": 1},
)
def get_multi_audios_query() -> QueryResult:
question = "Are these two audio clips the same?"
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
"<|audio_start|><|audio_pad|><|audio_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": [
AudioAsset("winning_call").audio_and_sample_rate,
AudioAsset("mary_had_lamb").audio_and_sample_rate,
],
},
},
limit_mm_per_prompt={
"audio": 2,
},
)
query_map = {
"mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query,
"multi_audios": get_multi_audios_query,
}
def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
query_result = query_map[args.query_type]()
llm = LLM(
model=model_name,
max_model_len=12800,
max_num_seqs=5,
limit_mm_per_prompt=query_result.limit_mm_per_prompt,
seed=args.seed,
)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"audio language models"
)
parser.add_argument(
"--query-type",
"-q",
type=str,
default="mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -133,7 +133,7 @@ def main(args):
tensor_parallel_size=args.tp,
enable_chunked_prefill=args.enable_chunked_prefill,
enforce_eager=args.enforce_eager,
gpu_memory_utilization=0.8,
gpu_memory_utilization=0.9,
speculative_config=speculative_config,
disable_log_stats=False,
max_model_len=args.max_model_len,

41
examples/offline_inference/vision_language.py Normal file → Executable file
View File

@ -538,6 +538,31 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
)
# HunyuanOCR
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "tencent/HunyuanOCR"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
limit_mm_per_prompt={modality: 1},
)
placeholder = "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>" # noqa: E501
prompts = [
f"<hy_begin▁of▁sentence>{placeholder}{question}<hy_User>"
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=None,
)
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
questions: list[str], modality: str
@ -1820,6 +1845,7 @@ model_example_map = {
"glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8,
"h2ovl_chat": run_h2ovl,
"hunyuan_vl": run_hunyuan_vl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
"interns1": run_interns1,
@ -2038,6 +2064,13 @@ def parse_args():
help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -2046,6 +2079,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
modality = args.modality
mm_input = get_multi_modal_input(args)
data = mm_input["data"]
@ -2063,6 +2102,8 @@ def main(args):
"seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`.

View File

@ -1110,6 +1110,7 @@ def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model=model_name,
max_model_len=16384,
max_num_seqs=16,
trust_remote_code=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
@ -1351,10 +1352,18 @@ model_example_map = {
}
def run_generate(model, question: str, image_urls: list[str], seed: int | None):
def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
sampling_params = SamplingParams(
@ -1377,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory
@ -1387,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
sampling_params = (
@ -1462,6 +1479,13 @@ def parse_args():
default=2,
help="Number of images to use for the demo.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -1469,13 +1493,20 @@ def main(args: Namespace):
model = args.model_type
method = args.method
seed = args.seed
tensor_parallel_size = args.tensor_parallel_size
if tensor_parallel_size is not None and tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {tensor_parallel_size}"
)
image_urls = IMAGE_URLS[: args.num_images]
if method == "generate":
run_generate(model, QUESTION, image_urls, seed)
run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
elif method == "chat":
run_chat(model, QUESTION, image_urls, seed)
run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
else:
raise ValueError(f"Invalid method: {method}")

View File

@ -22,7 +22,6 @@ API_KEY=${API_KEY:-"your-api-key"}
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
export VLLM_ENABLE_CHUNKED_PROCESSING=true
export CUDA_VISIBLE_DEVICES=2,3,4,5
# export VLLM_ATTENTION_BACKEND=XFORMERS
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
echo "=================================================================="

View File

@ -4,9 +4,9 @@ packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
wheel
jinja2>=3.1.6

View File

@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d
packaging>=24.2
setuptools>=77.0.3,<81.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.8.0; platform_machine == "ppc64le"
torchaudio==2.9.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.23.0; platform_machine == "ppc64le"
torchvision==0.24.0; platform_machine == "ppc64le"
datasets # for benchmark scripts
# Intel Extension for PyTorch, only for x86_64 CPUs

View File

@ -9,6 +9,5 @@ torch==2.9.0
torchaudio==2.9.0
# These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2
flashinfer-python==0.5.3

View File

@ -1,2 +1,2 @@
lmcache
nixl >= 0.6.0 # Required for disaggregated prefill
nixl >= 0.7.1 # Required for disaggregated prefill

View File

@ -29,7 +29,7 @@ opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.57.1
transformers==4.57.3
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization

View File

@ -45,3 +45,7 @@ multiprocess==0.70.16
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0
# Required for suffix decoding test
arctic-inference == 0.1.1

View File

@ -37,7 +37,7 @@ datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.1
transformers==4.57.3
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization

View File

@ -1196,7 +1196,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.57.1
transformers==4.57.3
# via
# -r requirements/test.in
# genai-perf

View File

@ -12,6 +12,4 @@ ray[data]
setuptools==78.1.0
nixl==0.3.0
tpu_info==0.4.0
# Install torch_xla
torch_xla[tpu, pallas]==2.8.0
tpu-inference==0.11.1

View File

@ -10,9 +10,9 @@ wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.61.2 # Required for N-gram speculative decoding
torch==2.8.0+xpu
--extra-index-url=https://download.pytorch.org/whl/xpu
torch==2.9.0+xpu
torchaudio
torchvision
--extra-index-url=https://download.pytorch.org/whl/xpu
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl

View File

@ -74,9 +74,6 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
pytest.skip(f"{backend} does not support gemma2 with full context length.")
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)

View File

@ -55,7 +55,7 @@ class SillyModel(nn.Module):
def _run_simple_model(
splitting_ops,
use_inductor_graph_partition,
use_inductor,
backend,
expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations,
@ -64,7 +64,7 @@ def _run_simple_model(
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
use_inductor=use_inductor,
backend=backend,
splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True,
@ -124,14 +124,14 @@ def _run_simple_model(
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False])
@pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode()
@create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor):
def test_simple_piecewise_compile(backend):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
backend=backend,
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=True,
use_inductor=True,
backend="inductor",
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level

View File

@ -172,8 +172,8 @@ def test_splitting_ops_dynamic():
config = VllmConfig()
# Default V1 config leaves cudagraph mode unset; splitting ops are only
# populated when the engine decides to use piecewise compilation.
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
assert not config.compilation_config.splitting_ops_contain_attention()
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
assert config.compilation_config.splitting_ops_contain_attention()
# When use_inductor_graph_partition=True
config = VllmConfig(

View File

@ -0,0 +1,88 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import pytest
import torch
from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationMode, DynamicShapesType
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils.torch_utils import is_torch_equal_or_newer
def get_test_models():
"""Get list of models to test based on PyTorch version"""
# TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
@pytest.mark.parametrize("model_name", get_test_models())
@pytest.mark.parametrize(
"shapes_type",
[
DynamicShapesType.BACKED,
DynamicShapesType.UNBACKED,
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
],
)
@pytest.mark.parametrize("use_aot_compile", ["0"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_dynamic_shapes_compilation(
monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
):
"""Test that all dynamic shapes types compile successfully"""
print(
f"\nTesting model: {model_name} with {shapes_type.name}, "
f"AOT compile: {use_aot_compile}, "
f"Bytecode hook: {use_bytecode_hook}"
)
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
prompt = "Hello, my name is"
print(f"Testing {shapes_type.name} dynamic shapes...")
# Initialize the model with specific dynamic shapes configuration
model = LLM(
model=model_name,
compilation_config={
"mode": CompilationMode.VLLM_COMPILE,
"dynamic_shapes_config": {
"type": shapes_type.value,
},
},
)
output = model.generate(prompt)
result = output[0].outputs[0].text
# Example of setting the sampling parameters
tokenizer = get_tokenizer(model_name)
yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
no_tokens = tokenizer.encode("no", add_special_tokens=False)
allowed_ids = list(set(yes_tokens + no_tokens))
sampling_params = SamplingParams(
max_tokens=1, temperature=0, allowed_token_ids=allowed_ids
)
output = model.generate(
"answer with yes or no is " + result + " rubbish for prompt " + prompt + "?",
sampling_params=sampling_params,
)
result = output[0].outputs[0].text
assert result == "yes"
# Clean up GPU memory
del model
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
print("GPU memory cleared")

View File

@ -9,8 +9,9 @@ from tests.compile.backend import LazyInitPass, TestBackend
from tests.utils import flat_product
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import Attention
from vllm.attention.selector import global_force_attn_backend_context_manager
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes

View File

@ -5,7 +5,8 @@ import pytest
import torch
from tests.compile.backend import TestBackend
from vllm.attention import Attention, AttentionType
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass

View File

@ -1,13 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import random
import pytest
import torch
import torch.distributed
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
from vllm.distributed.eplb.rebalance_execute import (
move_from_buffer,
rearrange_expert_weights_inplace,
transfer_layer,
)
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
@ -231,6 +236,100 @@ def verify_redundant_experts_have_same_weights(
)
def _test_async_transfer_layer_without_mtp_worker(
env,
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
tp_group = get_tp_group()
ep_group = tp_group.device_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
total_physical_experts = world_size * num_local_experts
hidden_sizes = [16, 32]
redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
old_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
redundancy_config,
)
new_redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
new_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
new_redundancy_config,
)
expert_weights = create_expert_weights(
num_layers,
num_local_experts,
hidden_sizes,
ep_rank,
device,
old_indices,
)
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
cuda_stream = torch.cuda.Stream(device=device)
for layer_idx in range(num_layers):
is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
transfer_layer(
old_global_expert_indices=old_indices,
new_global_expert_indices=new_indices,
expert_weights=expert_weights,
expert_weights_buffer=expert_buffer,
ep_group=ep_group,
layer=layer_idx,
cuda_stream=cuda_stream,
)
)
cuda_stream.synchronize()
move_from_buffer(
expert_weights=expert_weights[layer_idx],
expert_weights_buffer=expert_buffer,
is_unchanged=is_unchanged,
is_received_locally=is_received_locally,
experts_recv_loc=experts_recv_loc,
new_indices=new_indices[layer_idx].tolist(),
ep_group=ep_group,
)
verify_expert_weights_after_shuffle(
expert_weights,
new_indices,
hidden_sizes,
ep_rank,
num_local_experts,
)
verify_redundant_experts_have_same_weights(
expert_weights,
new_indices,
hidden_sizes,
world_size,
num_local_experts,
)
def _test_rearrange_expert_weights_with_redundancy(
env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
@ -399,6 +498,32 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
)
@pytest.mark.parametrize(
"world_size,num_layers,num_local_experts,num_logical_experts",
[
(2, 2, 2, 3),
],
)
def test_async_transfer_layer_without_mtp(
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
):
"""Exercise async EPLB transfer path without MTP/spec decode."""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_async_transfer_layer_without_mtp_worker,
world_size,
num_layers,
num_local_experts,
num_logical_experts,
)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
"""

View File

@ -10,10 +10,11 @@ from tests.utils import large_gpu_mark
def get_model_args(
model_name: str,
spec_model_name: str,
spec_model_name: str | None,
spec_method: str,
tp_size: int,
model_max_len: int,
use_async: bool = False,
) -> dict:
speculative_config = {
"method": spec_method,
@ -37,6 +38,8 @@ def get_model_args(
"enable_eplb": True,
"max_model_len": model_max_len,
}
if use_async:
model_args["eplb_config"] = {"use_async": True}
return model_args
@ -94,3 +97,37 @@ def test_eplb_spec_decode(
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
@large_gpu_mark(min_gb=80)
def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
"""
Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
"""
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
expected_gsm8k_value = 0.86
model_args = get_model_args(
model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
spec_model_name=None,
spec_method="mtp",
tp_size=4,
model_max_len=4096,
use_async=True,
)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=TASK,
batch_size=64,
num_fewshot=8,
)
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"

View File

@ -222,6 +222,47 @@ def test_media_io_kwargs_parser(arg, expected):
assert args.media_io_kwargs == expected
@pytest.mark.parametrize(
("args", "expected"),
[
(["-O", "1"], "1"),
(["-O", "2"], "2"),
(["-O", "3"], "3"),
(["-O0"], "0"),
(["-O1"], "1"),
(["-O2"], "2"),
(["-O3"], "3"),
],
)
def test_optimization_level(args, expected):
"""
Test space-separated optimization levels (-O 1, -O 2, -O 3) map to
optimization_level.
"""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
parsed_args = parser.parse_args(args)
assert parsed_args.optimization_level == expected
assert parsed_args.compilation_config.mode is None
@pytest.mark.parametrize(
("args", "expected"),
[
(["-O.mode=0"], 0),
(["-O.mode=1"], 1),
(["-O.mode=2"], 2),
(["-O.mode=3"], 3),
],
)
def test_mode_parser(args, expected):
"""
Test compilation config modes (-O.mode=int) map to compilation_config.
"""
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
parsed_args = parser.parse_args(args)
assert parsed_args.compilation_config.mode == expected
def test_compilation_config():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
@ -229,34 +270,17 @@ def test_compilation_config():
args = parser.parse_args([])
assert args.compilation_config == CompilationConfig()
# set to O3
args = parser.parse_args(["-O0"])
assert args.compilation_config.mode == 0
# set to O 3 (space)
args = parser.parse_args(["-O", "1"])
assert args.compilation_config.mode == 1
# set to O 3 (equals)
args = parser.parse_args(["-O=2"])
assert args.compilation_config.mode == 2
# set to O.mode 3
args = parser.parse_args(["-O.mode", "3"])
assert args.compilation_config.mode == 3
# set to string form of a dict
args = parser.parse_args(
[
"-O",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}',
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
]
)
assert (
args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor
and args.compilation_config.backend == "eager"
)
# set to string form of a dict
@ -264,13 +288,13 @@ def test_compilation_config():
[
"--compilation-config="
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}',
'"backend": "inductor"}',
]
)
assert (
args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor
and args.compilation_config.backend == "inductor"
)
@ -278,8 +302,9 @@ def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([])
# should be None by default (depends on model).
engine_args = EngineArgs.from_cli_args(args=args)
assert engine_args.enable_prefix_caching, "prefix caching should default to on."
assert engine_args.enable_prefix_caching is None
# with flag to turn it on.
args = parser.parse_args(["--enable-prefix-caching"])

View File

@ -183,9 +183,6 @@ async def test_metrics_counts(
EXPECTED_METRICS_V1 = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:kv_cache_usage_perc",
"vllm:prefix_cache_queries",
"vllm:prefix_cache_hits",

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import time
@ -35,6 +35,10 @@ GET_WEATHER_SCHEMA = {
@pytest.fixture(scope="module")
def server():
assert importlib.util.find_spec("gpt_oss") is not None, (
"Harmony tests require gpt_oss package to be installed"
)
args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",

View File

@ -2,20 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import io
import json
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
SERVER_ARGS = ["--enforce-eager"]
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
@ -26,22 +18,8 @@ MISTRAL_FORMAT_ARGS = [
]
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]
)
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
server_args = ["--enforce-eager"]
@ -120,176 +98,3 @@ async def test_basic_audio_gemma(foscolo):
)
out = json.loads(transcription)["text"]
assert "da cui vergine nacque Venere" in out
@pytest.mark.asyncio
async def test_non_asr_model(winning_call):
# text to text model
model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
client = remote_server.get_async_client()
res = await client.audio.transcriptions.create(
model=model_name, file=winning_call, language="en", temperature=0.0
)
err = res.error
assert err["code"] == 400 and not res.text
assert err["message"] == "The model does not support Transcriptions API"
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, client):
# invalid language
with pytest.raises(openai.BadRequestError):
await client.audio.transcriptions.create(
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
)
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, client):
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
audio = np.pad(audio, (0, 1600))
repeated_audio = np.tile(audio, 10)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=buffer,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
async def test_completion_endpoints(client):
# text to text model
res = await client.chat.completions.create(
model=MODEL_NAME,
messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Chat Completions API"
res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call, client):
transcription = ""
res_no_stream = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
response_format="json",
language="en",
temperature=0.0,
)
res = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
timeout=30,
)
# Reconstruct from chunks and validate
async for chunk in res:
text = chunk.choices[0]["delta"]["content"]
transcription += text
assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, client):
res = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
timeout=30,
)
final = False
continuous = True
async for chunk in res:
if not len(chunk.choices):
# final usage sent
final = True
else:
continuous = continuous and hasattr(chunk, "usage")
assert final and continuous
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, client):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameters values are picked.
"""
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.8,
extra_body=dict(
seed=42,
repetition_penalty=1.9,
top_k=12,
top_p=0.4,
min_p=0.5,
frequency_penalty=1.8,
presence_penalty=2.0,
),
)
greedy_transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.0,
extra_body=dict(seed=42),
)
assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, client):
prompt = "This is a speech, recorded in a phonograph."
# Prompts should not omit the part of original prompt while transcribing.
prefix = "The first words I spoke in the original phonograph"
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)["text"]
assert prefix in out
transcription_wprompt = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0,
)
out_prompt = json.loads(transcription_wprompt)["text"]
assert prefix in out_prompt

View File

@ -0,0 +1,237 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for structured outputs tests
import asyncio
import io
import json
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
SERVER_ARGS = ["--enforce-eager"]
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def whisper_client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_basic_audio(mary_had_lamb):
server_args = ["--enforce-eager"]
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "Mary had a little lamb," in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
transcription = whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
transcription2 = whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
response_format="text",
temperature=0.0,
)
# Await both transcriptions by scheduling coroutines together
transcription, transcription2 = await asyncio.gather(transcription, transcription2)
out = json.loads(transcription)
out_text = out["text"]
assert "Mary had a little lamb," in out_text
out2 = json.loads(transcription2)
out_text2 = out2["text"]
assert "Edgar Martinez" in out_text2
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
# invalid language
with pytest.raises(openai.BadRequestError):
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
)
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
audio = np.pad(audio, (0, 1600))
repeated_audio = np.tile(audio, 10)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=buffer,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
# text to text model
res = await whisper_client.chat.completions.create(
model=MODEL_NAME,
messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Chat Completions API"
res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
err = res.error
assert err["code"] == 400
assert err["message"] == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
transcription = ""
res_no_stream = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
response_format="json",
language="en",
temperature=0.0,
)
res = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
timeout=30,
)
# Reconstruct from chunks and validate
async for chunk in res:
text = chunk.choices[0]["delta"]["content"]
transcription += text
assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
res = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
timeout=30,
)
final = False
continuous = True
async for chunk in res:
if not len(chunk.choices):
# final usage sent
final = True
else:
continuous = continuous and hasattr(chunk, "usage")
assert final and continuous
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameters values are picked.
"""
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.8,
extra_body=dict(
seed=42,
repetition_penalty=1.9,
top_k=12,
top_p=0.4,
min_p=0.5,
frequency_penalty=1.8,
presence_penalty=2.0,
),
)
greedy_transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.0,
extra_body=dict(seed=42),
)
assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
prompt = "This is a speech, recorded in a phonograph."
# Prompts should not omit the part of original prompt while transcribing.
prefix = "The first words I spoke in the original phonograph"
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)["text"]
assert prefix in out
transcription_wprompt = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0,
)
out_prompt = json.loads(transcription_wprompt)["text"]
assert prefix in out_prompt

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
@ -132,3 +134,129 @@ def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
assert result.tool_calls[0].function.name == "searchTool"
assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
assert result.tool_calls[2].function.name == "searchTool"
def test_extract_tool_calls_deeply_nested_json(parser):
# Test with deeply nested JSON parameters (5 levels)
model_output = (
'{"name": "complexTool", '
'"parameters": {'
'"level1": {'
'"level2": {'
'"level3": {'
'"level4": {'
'"value": "deep"'
"}}}}}}"
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 1
assert result.tool_calls[0].function.name == "complexTool"
# Verify the nested structure is preserved in the arguments
import json
args = json.loads(result.tool_calls[0].function.arguments)
assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
def test_extract_tool_calls_multiple_with_deep_nesting(parser):
# Test with multiple tool calls where some have deeply nested parameters
model_output = (
'{"name": "simpleTool", "parameters": {"value": "test"}}; '
'{"name": "complexTool", "parameters": '
'{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 2
# Check first tool call
assert result.tool_calls[0].function.name == "simpleTool"
import json
args0 = json.loads(result.tool_calls[0].function.arguments)
assert args0["value"] == "test"
# Check second tool call with deep nesting
assert result.tool_calls[1].function.name == "complexTool"
args1 = json.loads(result.tool_calls[1].function.arguments)
assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
# Test with quotes and brackets inside quoted string values
model_output = (
'{"name": "searchTool", '
'"parameters": {'
'"query": "test {value} [complex]",'
'"nested": {"inner": "more {brackets}"}'
"}}"
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 1
assert result.tool_calls[0].function.name == "searchTool"
# Verify the string values are preserved including brackets and quotes
import json
args = json.loads(result.tool_calls[0].function.arguments)
assert args["query"] == "test {value} [complex]"
assert args["nested"]["inner"] == "more {brackets}"
def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
# Test with escaped quotes in deeply nested JSON
model_output = (
'{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 1
assert result.tool_calls[0].function.name == "parserTool"
# Verify escaped quotes are preserved
import json
args = json.loads(result.tool_calls[0].function.arguments)
assert args["text"] == 'He said "Hello {world}"'
def test_extract_tool_calls_missing_name_key(parser):
# Test that missing "name" key returns content
model_output = '{"parameters": {}}'
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is False
assert len(result.tool_calls) == 0
assert result.content == model_output
def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
# Test that missing both "parameters" and "arguments" keys returns content
model_output = '{"name": "toolWithoutParams"}'
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is False
assert len(result.tool_calls) == 0
assert result.content == model_output
def test_regex_timeout_handling(parser):
"""Test regex timeout is handled gracefully"""
fake_problematic_input = "{hello world[A(A=" + "\t)A(A=,\t" * 2
# create a mock regex that raises TimeoutError
mock_regex = MagicMock()
mock_regex.finditer.side_effect = TimeoutError("Regex timeout")
with patch.object(parser, "tool_call_start_regex", mock_regex):
result = parser.extract_tool_calls(fake_problematic_input, None)
# should treat as regular text when regex times out
assert result.content == fake_problematic_input
assert result.tools_called is False
assert len(result.tool_calls) == 0
mock_regex.finditer.assert_called_once()

View File

@ -2,6 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_reasoning_item import (
Content,
ResponseReasoningItem,
@ -76,6 +79,18 @@ class TestResponsesUtils:
== 'Hmm, the user has just started with a simple "Hello,"'
)
tool_call_output = ResponseFunctionToolCallOutputItem(
id="temp_id",
type="function_call_output",
call_id="temp",
output="1234",
status="completed",
)
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
assert formatted_item["role"] == "tool"
assert formatted_item["content"] == "1234"
assert formatted_item["tool_call_id"] == "temp"
item = ResponseReasoningItem(
id="lol",
summary=[],

View File

@ -13,12 +13,6 @@ from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
if not current_platform.is_rocm():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from tests.kernels.utils import make_alibi_bias
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
@ -448,129 +442,6 @@ def ref_multi_query_kv_attention(
return torch.cat(ref_outputs, dim=0)
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
)
@torch.inference_mode()
def test_multi_query_kv_attention(
num_seqs: int,
num_heads: tuple[int, int],
head_size: int,
dtype: torch.dtype,
seed: int,
device: str,
use_alibi: bool = False,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
# a smaller MAX_SEQ_LEN here.
max_len = min(MAX_SEQ_LEN, 4096)
seq_lens = random.sample(range(1, max_len), num_seqs)
num_tokens = sum(seq_lens)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
qkv = torch.empty(
num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype
)
qkv.uniform_(-scale, scale)
query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1)
num_queries_per_kv = num_query_heads // num_kv_heads
if num_queries_per_kv > 1:
# Handle MQA and GQA
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
alibi_bias = None
if use_alibi:
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
output = torch.empty_like(query)
start = 0
# Dynamic sequence length not supported with custom attn_bias.
for i, seq_len in enumerate(seq_lens):
end = start + seq_len
out = xops.memory_efficient_attention_forward(
query[None, start:end],
key[None, start:end],
value[None, start:end],
attn_bias=attn_bias[i],
p=0.0,
scale=scale,
)
output[start:end].copy_(out.view_as(query[start:end]))
start += seq_len
# xformers.AttentionBias to Tensor for use in reference impl.
alibi_bias = [
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
for b, i in zip(attn_bias, seq_lens)
]
else:
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
output = xops.memory_efficient_attention_forward(
query.unsqueeze(0),
key.unsqueeze(0),
value.unsqueeze(0),
attn_bias=attn_bias,
p=0.0,
scale=scale,
)
output = output.squeeze(0)
cu_seq_lens = [0]
for seq_len in seq_lens:
cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
ref_output = ref_multi_query_kv_attention(
cu_seq_lens,
query,
key,
value,
scale,
alibi_bias,
dtype,
)
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", [64])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
)
@torch.inference_mode()
def test_multi_query_kv_attention_with_alibi(
num_seqs: int,
num_heads: tuple[int, int],
head_size: int,
dtype: torch.dtype,
seed: int,
device: str,
) -> None:
return test_multi_query_kv_attention(
num_seqs,
num_heads,
head_size,
dtype,
seed,
device,
use_alibi=True,
)
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64

Some files were not shown because too many files have changed in this diff Show More