Merge branch 'main' into pynccl_symm_fix

This commit is contained in:
Amir Samani 2025-11-25 10:31:22 -08:00 committed by GitHub
commit f5f1ab26c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
308 changed files with 12795 additions and 3905 deletions

View File

@ -17,7 +17,17 @@ wait_for_server() {
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then

View File

@ -1,10 +1,12 @@
#!/usr/bin/env bash
set -euxo pipefail
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
DATA_PARALLEL_SIZE=${4:-2}
TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
@ -17,7 +19,16 @@ wait_for_server() {
}
MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
# ROCm platform
BACKENDS=("allgather_reducescatter")
# Disable MOE padding for ROCm since it is causing eplb to fail
export VLLM_ROCM_MOE_PADDING=0
else
# Non-ROCm platform (CUDA/other)
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@ -36,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
--data-parallel-size 2 \
--enable-eplb \
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \

View File

@ -754,6 +754,7 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
@ -1319,7 +1320,10 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- pytest -v -s -x lora/test_gptoss_tp.py
# Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min
@ -1482,4 +1486,4 @@ steps:
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020

View File

@ -192,6 +192,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
@ -691,6 +692,7 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- vllm/transformers_utils/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
@ -901,11 +903,12 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
@ -969,6 +972,7 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
@ -1115,6 +1119,7 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
@ -1339,11 +1344,20 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
- label: Qwen3-30B-A3B-FP8-block Accuracy
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
timeout_in_minutes: 60
gpu: b200
optional: true
num_gpus: 2
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1

2
.github/CODEOWNERS vendored
View File

@ -9,6 +9,7 @@
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
@ -59,6 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
/tests/v1/determinism @yewentao256
# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor

View File

@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")

View File

@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
Currently this just includes dense GEMMs and only works on Hopper GPUs.
Currently, this just includes dense GEMMs and only works on Hopper GPUs.
## Setup

View File

@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt);

View File

@ -905,91 +905,79 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
int ENTRY_SIZE, int CTA_SIZE>
__global__ void gather_and_maybe_dequant_cache(
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t block_size, const int32_t entry_size,
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
const int32_t num_tokens, const int32_t block_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const float* __restrict__ scale,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
using ltype = vllm::vec_n_t<cache_t, vec_size>;
using stype = vllm::vec_n_t<scalar_t, vec_size>;
// We are adding this for code readability which will be optimized out when
// build in release.
assert(CTA_SIZE == blockDim.x);
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = cu_seq_lens[bid];
const int32_t seq_end = cu_seq_lens[bid + 1];
const int32_t seq_len = seq_end - seq_start;
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
#pragma unroll
for (int token_id = blockIdx.x; token_id < num_tokens;
token_id += gridDim.x) {
int64_t batch_id = token_to_seq[token_id];
int64_t batch_start = cu_seq_lens[batch_id];
int64_t batch_end = cu_seq_lens[batch_id + 1];
int32_t batch_offset = token_id - batch_start;
const int32_t split_start = split * split_blocks;
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
if (token_id >= batch_end) return;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[batch_id];
}
batch_offset += offset;
int32_t block_table_id = batch_offset / block_size;
int32_t slot_id = batch_offset % block_size;
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
int32_t block_id = block_table[block_table_offset];
int64_t cache_offset =
block_id * cache_block_stride + slot_id * cache_entry_stride;
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
scalar_t* dst_ = dst + token_id * dst_entry_stride;
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
const bool is_active_split = (split_start < tot_blocks);
const bool is_last_split = (split_end == tot_blocks);
if (!is_active_split) return;
int32_t full_blocks_end = split_end;
int32_t partial_block_size = 0;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
// page_size)
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[bid] / block_size;
}
const int32_t* batch_block_table = block_table + batch_offset + offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
if (is_last_split) {
partial_block_size = seq_len % block_size;
if (partial_block_size) full_blocks_end -= 1;
}
auto copy_entry = [&](const cache_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
#pragma unroll
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
_dst[i] = static_cast<scalar_t>(_src[i]);
reinterpret_cast<stype*>(dst_)[idx] =
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
} else {
_dst[i] =
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
stype store_val;
#pragma unroll
for (int j = 0; j < vec_size; ++j) {
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
loaded_val.val[j], *scale);
}
reinterpret_cast<stype*>(dst_)[idx] = store_val;
}
}
};
const auto loop_end =
std::min((int64_t)full_blocks_end, block_table_stride - offset);
for (int pid = split_start; pid < loop_end; ++pid) {
auto block_id = batch_block_table[pid];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
for (int eid = 0; eid < block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
if (partial_block_size) {
if (offset + full_blocks_end < block_table_stride) {
auto block_id = batch_block_table[full_blocks_end];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr =
dst + full_blocks_end * block_size * dst_entry_stride;
for (int eid = 0; eid < partial_block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
// process tail
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
src_ = src_ + ENTRY_SIZE - tail_cnt;
#pragma unroll
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst_[idx] = static_cast<scalar_t>(src_[idx]);
} else {
dst_[idx] =
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
}
}
}
@ -1001,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_size, entry_size, block_table_stride, cache_block_stride, \
cache_entry_stride, dst_entry_stride, \
reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
thread_block_size> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \
block_table_stride, cache_block_stride, cache_entry_stride, \
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
// - token_to_seq contains the back mapping from token_id to batch_id
// - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size)
void gather_and_maybe_dequant_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, const std::string& kv_cache_dtype,
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t entry_size = src_cache.flatten(2, -1).size(2);
int32_t head_dim = dst.size(-1);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
@ -1038,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(head_dim == 576,
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
"for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@ -1055,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size.
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(1024);
constexpr int32_t thread_block_size = 64;
dim3 grid(num_tokens);
dim3 block(thread_block_size);
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;

View File

@ -13,6 +13,18 @@
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif
#ifdef __aarch64__
#include "cpu_attn_neon.hpp"
#define NEON_DISPATCH(...) \
case cpu_attention::ISA::NEON: { \
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
scalar_t, head_dim>; \
return __VA_ARGS__(); \
}
#else
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
#endif // #ifdef __aarch64__
#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
case HEAD_DIM: { \
constexpr size_t head_dim = HEAD_DIM; \
@ -41,6 +53,7 @@
[&] { \
switch (ISA_TYPE) { \
AMX_DISPATCH(__VA_ARGS__) \
NEON_DISPATCH(__VA_ARGS__) \
case cpu_attention::ISA::VEC: { \
using attn_impl = \
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t, \
@ -73,6 +86,8 @@ torch::Tensor get_scheduler_metadata(
isa = cpu_attention::ISA::VEC;
} else if (isa_hint == "vec16") {
isa = cpu_attention::ISA::VEC16;
} else if (isa_hint == "neon") {
isa = cpu_attention::ISA::NEON;
} else {
TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
}
@ -158,6 +173,8 @@ void cpu_attn_reshape_and_cache(
return cpu_attention::ISA::VEC;
} else if (isa == "vec16") {
return cpu_attention::ISA::VEC16;
} else if (isa == "neon") {
return cpu_attention::ISA::NEON;
} else {
TORCH_CHECK(false, "Invalid ISA type: " + isa);
}

View File

@ -14,7 +14,7 @@
#include "utils.hpp"
namespace cpu_attention {
enum class ISA { AMX, VEC, VEC16 };
enum class ISA { AMX, VEC, VEC16, NEON };
template <ISA isa, typename scalar_t, int64_t head_dim>
class AttentionImpl {};
@ -143,6 +143,12 @@ struct AttentionMetadata {
case ISA::VEC:
ss << "VEC, ";
break;
case ISA::VEC16:
ss << "VEC16, ";
break;
case ISA::NEON:
ss << "NEON, ";
break;
}
ss << "workitem_group_num: " << workitem_group_num
<< ", reduction_item_num: " << reduction_item_num
@ -841,7 +847,7 @@ struct VecTypeTrait<c10::BFloat16> {
};
#endif
#if !defined(__powerpc__)
#if !defined(__powerpc__) && !defined(__s390x__)
template <>
struct VecTypeTrait<c10::Half> {
using vec_t = vec_op::FP16Vec16;

386
csrc/cpu/cpu_attn_neon.hpp Normal file
View File

@ -0,0 +1,386 @@
#ifndef CPU_ATTN_NEON_HPP
#define CPU_ATTN_NEON_HPP
#include "cpu_attn_impl.hpp"
#include <arm_neon.h>
#include <type_traits>
namespace cpu_attention {
namespace {
#define BLOCK_SIZE_ALIGNMENT 32
#define HEAD_SIZE_ALIGNMENT 32
#define MAX_Q_HEAD_NUM_PER_ITER 16
// These do not use vectorized class for loading / converting
// because csrc/cpu/cpu_types_arm.hpp does not have fallback options
// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that
// doesn't support BF16.
// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency.
template <typename kv_cache_t>
FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
float32x4_t& b1);
template <>
FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, float32x4_t& b0,
float32x4_t& b1) {
b0 = vld1q_f32(p + 0);
b1 = vld1q_f32(p + 4);
}
template <>
FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
float32x4_t& b0,
float32x4_t& b1) {
const float16_t* h = reinterpret_cast<const float16_t*>(p);
float16x8_t v = vld1q_f16(h);
b0 = vcvt_f32_f16(vget_low_f16(v));
b1 = vcvt_f32_f16(vget_high_f16(v));
}
template <>
FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
float32x4_t& b0,
float32x4_t& b1) {
const uint16_t* u = reinterpret_cast<const uint16_t*>(p);
#ifdef ARM_BF16_SUPPORT
uint16x8_t u0 = vld1q_u16(u);
bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0);
b0 = vcvtq_low_f32_bf16(bf0);
b1 = vcvtq_high_f32_bf16(bf0);
#else
uint16x8_t x0 = vld1q_u16(u);
uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16);
uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16);
b0 = vreinterpretq_f32_u32(lo);
b1 = vreinterpretq_f32_u32(hi);
#endif
}
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
// #FMLAs = (K // 4) * (4 * 2 * M)
// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
template <int32_t M, typename kv_cache_t>
FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
const float* __restrict A, // [M x K],
const kv_cache_t* __restrict B, // [K x 8],
float* __restrict C, // [M x 8],
int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
// kernel supports max M of 8, as it'd spill for larger M
static_assert(1 <= M && M <= 8, "M must be in [1,8]");
// helpers for per-M codegen
#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
#define IF_M(i) if constexpr (M > (i))
// A row base pointers
#define DECL_A(i) const float* a##i = A + (i) * lda;
ROWS_APPLY(DECL_A)
#undef DECL_A
// declare 2 accumulators per row of M
#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
ROWS_APPLY(DECL_ACC)
#undef DECL_ACC
// initialize accumulators
#define INIT_ACC(i) \
IF_M(i) { \
if (accumulate) { \
acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
} else { \
acc##i##_0 = vdupq_n_f32(0.f); \
acc##i##_1 = vdupq_n_f32(0.f); \
} \
}
ROWS_APPLY(INIT_ACC)
#undef INIT_ACC
int32_t k = 0;
// K unrolled by 4
for (; k + 3 < K; k += 4) {
// load A[k..k+3] for each active row (M)
#define LOAD_A4(i) \
float32x4_t a##i##v; \
IF_M(i) a##i##v = vld1q_f32(a##i + k);
ROWS_APPLY(LOAD_A4)
#undef LOAD_A4
// helper: FMA lane L from aiv
#define FMAS_LANE(i, aiv, L) \
IF_M(i) { \
acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
}
// k + 0
{
float32x4_t b0, b1;
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
ROWS_APPLY(STEP_K0)
#undef STEP_K0
}
// k + 1
{
float32x4_t b0, b1;
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
ROWS_APPLY(STEP_K1)
#undef STEP_K1
}
// k + 2
{
float32x4_t b0, b1;
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
ROWS_APPLY(STEP_K2)
#undef STEP_K2
}
// k + 3
{
float32x4_t b0, b1;
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
ROWS_APPLY(STEP_K3)
#undef STEP_K3
}
#undef FMAS_LANE
}
// K tail
for (; k < K; ++k) {
float32x4_t b0, b1;
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
#define TAIL_ROW(i) \
IF_M(i) { \
float32x4_t ai = vdupq_n_f32(*(a##i + k)); \
acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
}
ROWS_APPLY(TAIL_ROW)
#undef TAIL_ROW
}
// store accumulators to C
#define STORE_ROW(i) \
IF_M(i) { \
vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
}
ROWS_APPLY(STORE_ROW)
#undef STORE_ROW
#undef ROWS_APPLY
#undef IF_M
}
template <int32_t N, typename kv_cache_t>
FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
const kv_cache_t* __restrict B,
float* __restrict C, int32_t M,
int32_t K, int64_t lda,
int64_t ldb, int64_t ldc,
bool accumulate) {
// micro kernel is Mx8
static_assert(N % 8 == 0, "N must be a multiple of 8");
for (int32_t m = 0; m < M;) {
int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
const float* Ab = A + m * lda;
float* Cb = C + m * ldc;
for (int32_t n = 0; n < N; n += 8) {
const kv_cache_t* Bn = B + n;
float* Cn = Cb + n;
switch (mb) {
case 8:
gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
K, accumulate);
break;
case 4:
gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
K, accumulate);
break;
case 2:
gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
K, accumulate);
break;
default:
gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
K, accumulate);
break;
}
}
// no tail loop for N as it's guaranteed to be a multiple of 8
m += mb;
}
}
template <typename kv_cache_t>
class TileGemmNeonFMLA {
public:
template <AttentionGemmPhase phase, int32_t k_size>
FORCE_INLINE static void gemm(const int32_t m_size,
float* __restrict__ a_tile,
kv_cache_t* __restrict__ b_tile,
float* __restrict__ c_tile, const int64_t lda,
const int64_t ldb, const int64_t ldc,
const int32_t block_size,
const int32_t dynamic_k_size,
const bool accum_c) {
if constexpr (phase == AttentionGemmPhase::QK) {
gemm_macro_neon_fmla_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
} else {
gemm_macro_neon_fmla_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
accum_c);
}
}
};
} // namespace
// this is similar to "ISA::VEC" at the moment
template <typename scalar_t, int64_t head_dim>
class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
public:
using query_t = scalar_t;
using q_buffer_t = float;
using kv_cache_t = scalar_t;
using logits_buffer_t = float;
using partial_output_buffer_t = float;
using prob_buffer_t = float;
constexpr static int64_t BlockSizeAlignment =
BLOCK_SIZE_ALIGNMENT; // KV token num unit of QK and PV phases
constexpr static int64_t HeadDimAlignment =
HEAD_SIZE_ALIGNMENT; // headdim num unit of PV phase
constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
constexpr static int64_t HeadDim = head_dim;
constexpr static ISA ISAType = ISA::NEON;
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
static_assert(HeadDim % HeadDimAlignment == 0);
// the gemm micro kernel is Mx8
static_assert(HeadDimAlignment % 8 == 0);
static_assert(BlockSizeAlignment % 8 == 0);
public:
template <template <typename tile_gemm_t> typename attention>
FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
attention<TileGemmNeonFMLA<kv_cache_t>> attention_iteration;
attention_iteration(CPU_ATTENTION_PARAMS);
}
// k_cache_token_group_stride: stride of K cache when move to next
// BlockSizeAlignment tokens in a block
constexpr static int64_t k_cache_token_group_stride(
const int32_t block_size) {
return BlockSizeAlignment; // layout of k_cache block is [head_dim,
// block_size], row-major
}
// v_cache_token_group_stride: stride of V cache when move to next
// BlockSizeAlignment tokens in a block
constexpr static int64_t v_cache_token_group_stride(
const int32_t block_size) {
return head_dim * BlockSizeAlignment; // layout of v_cache is [block_size,
// head_dim], row-major
}
// v_cache_head_group_stride: stride of V cache when move to next
// HeadDimAlignment head dims in a block
constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
return HeadDimAlignment; // layout of v_cache is [block_size, head_dim],
// row-major
}
// Copy q to q_buffer and cast it to fp32
static void copy_q_heads_tile(
scalar_t* __restrict__ src, // [q_num, q_heads_per_kv, head_size]
float* __restrict__ q_buffer, const int32_t q_num,
const int32_t q_heads_per_kv, const int64_t q_num_stride,
const int64_t q_head_stride, float scale) {
static_assert(head_dim % 16 == 0);
constexpr int32_t unroll_size = head_dim / 16;
using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
vec_op::FP32Vec16 scale_vec(scale);
for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
scalar_t* __restrict__ curr_q =
src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
float* __restrict__ curr_q_buffer =
q_buffer + q_num_idx * q_heads_per_kv * head_dim +
q_head_idx * head_dim;
vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
load_vec_t vec(curr_q);
vec_op::FP32Vec16 fp32_vec(vec);
fp32_vec = fp32_vec * scale_vec;
fp32_vec.save(curr_q_buffer);
curr_q += 16;
curr_q_buffer += 16;
});
}
}
}
// reshape K as column-major and V as row-major
static void reshape_and_cache(
const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
const int64_t* __restrict__ slot_mapping, const int64_t token_num,
const int64_t key_token_num_stride, const int64_t value_token_num_stride,
const int64_t head_num, const int64_t key_head_num_stride,
const int64_t value_head_num_stride, const int64_t num_blocks,
const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
const int64_t block_size, const int64_t block_size_stride) {
#pragma omp parallel for collapse(2)
for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
const int64_t pos = slot_mapping[token_idx];
if (pos < 0) {
// skip
continue;
}
const int64_t block_idx = pos / block_size;
const int64_t block_offset = pos % block_size;
{
// Write Key
const scalar_t* key_start_ptr = key +
token_idx * key_token_num_stride +
head_idx * key_head_num_stride;
scalar_t* key_cache_start_ptr =
key_cache + block_idx * num_blocks_stride +
head_idx * cache_head_num_stride + block_offset;
#pragma GCC unroll 8
for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
key_cache_start_ptr[j] = key_start_ptr[i];
}
}
{
// Write Value
const scalar_t* value_start_ptr = value +
token_idx * value_token_num_stride +
head_idx * value_head_num_stride;
scalar_t* value_cache_start_ptr =
value_cache + block_idx * num_blocks_stride +
head_idx * cache_head_num_stride + block_offset * head_dim;
std::memcpy(value_cache_start_ptr, value_start_ptr,
sizeof(scalar_t) * head_dim);
}
}
}
}
};
} // namespace cpu_attention
#endif // #ifndef CPU_ATTN_NEON_HPP

View File

@ -4,6 +4,7 @@
#include <vecintrin.h>
#include <cmath>
#include <limits>
#include <torch/all.h>
namespace vec_op {
@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
}
explicit FP32Vec8(const BF16Vec8& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
// On big-endian s390x, place BF16 first to get correct byte order
reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
}
float reduce_sum() const {
@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
}
FP32Vec8 exp() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::exp(ar.values[0]);
ret.val[0][1] = std::exp(ar.values[1]);
ret.val[0][2] = std::exp(ar.values[2]);
ret.val[0][3] = std::exp(ar.values[3]);
ret.val[1][0] = std::exp(ar.values[4]);
ret.val[1][1] = std::exp(ar.values[5]);
ret.val[1][2] = std::exp(ar.values[6]);
ret.val[1][3] = std::exp(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
f32x4x2_t out;
const __vector float log2e = vec_splats(1.44269504088896341f);
const __vector float one = vec_splats(1.0f);
const __vector float min_x = vec_splats(-87.3f);
const __vector float max_x = vec_splats(88.7f);
// 5th-degree minimax polynomial for 2^r (r in [0,1))
const __vector float c1 = vec_splats(0.6931471805599453f);
const __vector float c2 = vec_splats(0.240226506959101f);
const __vector float c3 = vec_splats(0.05550410866482158f);
const __vector float c4 = vec_splats(0.009618129107628477f);
const __vector float c5 = vec_splats(0.0013333558146428443f);
for (int i = 0; i < 2; i++) {
__vector float x = reg.val[i];
x = vec_max(x, min_x);
x = vec_min(x, max_x);
__vector float y = vec_mul(x, log2e);
__vector float kf = vec_floor(y);
__vector float r = vec_sub(y, kf);
__vector signed int k = vec_signed(kf);
const __vector signed int min_k = vec_splats((signed int)-126);
const __vector signed int max_k = vec_splats((signed int)127);
k = vec_min(vec_max(k, min_k), max_k);
// Build 2^k from exponent bits
__vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
__vector unsigned int bits = (__vector unsigned int)exp_int;
bits = vec_sl(bits, vec_splats((unsigned int)23));
__vector float pow2k = (__vector float)bits;
// Improved minimax polynomial
__vector float poly = vec_madd(c5, r, c4);
poly = vec_madd(poly, r, c3);
poly = vec_madd(poly, r, c2);
poly = vec_madd(poly, r, c1);
poly = vec_madd(poly, r, one);
out.val[i] = vec_mul(pow2k, poly);
}
return FP32Vec8(out);
}
FP32Vec8 tanh() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::tanh(ar.values[0]);
ret.val[0][1] = std::tanh(ar.values[1]);
ret.val[0][2] = std::tanh(ar.values[2]);
ret.val[0][3] = std::tanh(ar.values[3]);
ret.val[1][0] = std::tanh(ar.values[4]);
ret.val[1][1] = std::tanh(ar.values[5]);
ret.val[1][2] = std::tanh(ar.values[6]);
ret.val[1][3] = std::tanh(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
// tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
const __vector float one = vec_splats(1.0f);
const __vector float two = vec_splats(2.0f);
const __vector float zero = vec_splats(0.0f);
const __vector float sat =
vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
f32x4x2_t out;
for (int i = 0; i < 2; i++) {
__vector float x = reg.val[i];
__vector float ax = vec_abs(x);
// sign(x): +1 or -1
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
// saturation mask: |x| > sat
__vector __bool int saturated = vec_cmpgt(ax, sat);
// 2x
__vector float two_x = vec_mul(x, two);
// Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
f32x4x2_t tmp;
tmp.val[0] = two_x;
tmp.val[1] = two_x;
FP32Vec8 exp_2x_vec(tmp);
FP32Vec8 e2x = exp_2x_vec.exp();
__vector float e = e2x.reg.val[i];
// tanh(x) = (e - 1) / (e + 1)
__vector float num = vec_sub(e, one);
__vector float den = vec_add(e, one);
__vector float t = vec_div(num, den);
// For large |x|, clamp to sign(x)
out.val[i] = vec_sel(t, sign, saturated);
}
return FP32Vec8(out);
}
FP32Vec8 er() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::erf(ar.values[0]);
ret.val[0][1] = std::erf(ar.values[1]);
ret.val[0][2] = std::erf(ar.values[2]);
ret.val[0][3] = std::erf(ar.values[3]);
ret.val[1][0] = std::erf(ar.values[4]);
ret.val[1][1] = std::erf(ar.values[5]);
ret.val[1][2] = std::erf(ar.values[6]);
ret.val[1][3] = std::erf(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
// A&S 7.1.26 approximation:
// erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
// exp(-x^2)) t = 1 / (1 + p*|x|), p = 0.3275911
const __vector float one = vec_splats(1.0f);
const __vector float zero = vec_splats(0.0f);
const __vector float p = vec_splats(0.3275911f);
// Polynomial coeffs
const __vector float a1 = vec_splats(0.254829592f);
const __vector float a2 = vec_splats(-0.284496736f);
const __vector float a3 = vec_splats(1.421413741f);
const __vector float a4 = vec_splats(-1.453152027f);
const __vector float a5 = vec_splats(1.061405429f);
// Threshold where erf(x) ~ sign(x)
const __vector float sat = vec_splats(6.0f);
f32x4x2_t out;
for (int lane = 0; lane < 2; lane++) {
__vector float x = reg.val[lane];
__vector float ax = vec_abs(x);
// sign(x)
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
// |x| > 6 → erf(x) = ±1
__vector __bool int saturated = vec_cmpgt(ax, sat);
// t = 1 / (1 + p * |x|)
__vector float t = vec_madd(p, ax, one);
t = vec_div(one, t);
// poly = a5
__vector float poly = a5;
poly = vec_madd(poly, t, a4);
poly = vec_madd(poly, t, a3);
poly = vec_madd(poly, t, a2);
poly = vec_madd(poly, t, a1);
// full polynomial: poly = poly * t
poly = vec_mul(poly, t);
// Compute exp(-x^2)
__vector float x2 = vec_mul(x, x);
__vector float neg_x2 = vec_neg(x2);
f32x4x2_t tmp;
tmp.val[0] = neg_x2;
tmp.val[1] = neg_x2;
FP32Vec8 exp_neg_x2(tmp);
FP32Vec8 e = exp_neg_x2.exp();
__vector float ex = e.reg.val[lane];
// erf(x) = sign * (1 - poly * exp(-x^2))
__vector float term = vec_mul(poly, ex);
__vector float y = vec_sub(one, term);
y = vec_mul(y, sign);
// saturated → ±1
__vector float sat_val = vec_mul(sign, one);
out.val[lane] = vec_sel(y, sat_val, saturated);
}
return FP32Vec8(out);
}
// Elementwise sigmoid(x) = 1 / (1 + exp(-x))
FP32Vec8 sigmoid() const {
const __vector float one = vec_splats(1.0f);
f32x4x2_t neg;
for (int i = 0; i < 2; ++i) {
neg.val[i] = vec_neg(reg.val[i]);
}
FP32Vec8 neg_x(neg);
FP32Vec8 e = neg_x.exp(); // exp(-x)
f32x4x2_t denom;
for (int i = 0; i < 2; ++i) {
denom.val[i] = vec_add(one, e.reg.val[i]);
}
FP32Vec8 denom_vec(denom);
FP32Vec8 one_vec(1.0f);
return one_vec / denom_vec;
}
// Tanh-based GELU:
// gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
FP32Vec8 gelu_tanh() const {
const __vector float k_s2pi = vec_splats(0.7978845608028654f); // √(2/π)
const __vector float k_0_0447 = vec_splats(0.044715f);
f32x4x2_t x2, x3, inner;
for (int i = 0; i < 2; ++i) {
__vector float x = reg.val[i];
x2.val[i] = vec_mul(x, x); // x^2
x3.val[i] = vec_mul(x2.val[i], x); // x^3
__vector float t = vec_madd(k_0_0447, x3.val[i], x); // x + 0.044715*x^3
inner.val[i] = vec_mul(k_s2pi, t); // √(2/π)*(...)
}
FP32Vec8 inner_vec(inner);
FP32Vec8 t = inner_vec.tanh(); // tanh part
FP32Vec8 one_vec(1.0f);
FP32Vec8 half_vec(0.5f);
FP32Vec8 x_vec(*this);
return x_vec * half_vec * (one_vec + t);
}
// Erf-based GELU:
// gelu(x) = 0.5 * x * (1 + erf(x / √2))
FP32Vec8 gelu_erf() const {
const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f); // 1/√2
FP32Vec8 x_vec(*this);
f32x4x2_t scaled;
for (int i = 0; i < 2; ++i) {
scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
}
FP32Vec8 x_scaled(scaled);
FP32Vec8 erf_x = x_scaled.er();
FP32Vec8 one_vec(1.0f);
FP32Vec8 half_vec(0.5f);
return x_vec * half_vec * (one_vec + erf_x);
}
// Elementwise reciprocal: 1/x (scalar per lane, for correctness)
FP32Vec8 rcp() const {
AliasReg in, out;
in.reg = reg;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
out.values[i] = 1.0f / in.values[i];
}
return FP32Vec8(out.reg);
}
// Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
FP32Vec8 rsqrt() const {
AliasReg in, out;
in.reg = reg;
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
out.values[i] = 1.0f / std::sqrt(in.values[i]);
}
return FP32Vec8(out.reg);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
}
explicit FP32Vec16(const BF16Vec16& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
// On big-endian s390x, place BF16 first to get correct byte order
reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
}
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return result;
}
FP32Vec16 max(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
vec_max(reg.val[1], b.reg.val[1]),
vec_max(reg.val[2], b.reg.val[2]),
vec_max(reg.val[3], b.reg.val[3])}));
}
float reduce_max() const {
AliasReg ar;
ar.reg = reg;
float result = ar.values[0];
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) {
if (ar.values[i] > result) result = ar.values[i];
});
return result;
}
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
@ -402,15 +628,14 @@ struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
};
// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
using FP16Vec16 = FP32Vec16;
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
acc = acc + a * b;
}
namespace c10 {
struct BFloat16 {
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
@ -429,6 +654,79 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
// intrinsics
// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_madd(a.reg, b.reg, acc.reg);
}
// FP32Vec8 FMA: acc = acc + (a * b)
FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
// FP32Vec16 FMA: acc = acc + (a * b)
FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
// Multiply-Subtract: acc = acc - (a * b)
FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_msub(a.reg, b.reg, acc.reg);
}
FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
// Negative Multiply-Add: acc = -(a * b) + acc
FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
}
FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
// Negative Multiply-Subtract: acc = -(a * b) - acc
FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
}
FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
}
FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
}
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int lsb0 = inp0 >> sh16;
__vector unsigned int lsb1 = inp1 >> sh16;
lsb0 = lsb0 & one;
lsb1 = lsb1 & one;
__vector unsigned int rnd0 = lsb0 + bias;
__vector unsigned int rnd1 = lsb1 + bias;
inp0 = inp0 + rnd0;
inp1 = inp1 + rnd1;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
inp0 = vec_sel(inp0, nan, sel0);
inp1 = vec_sel(inp1, nan, sel1);
inp0 = inp0 >> sh16;
inp1 = inp1 >> sh16;
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}
@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
__vector unsigned int lsb0 = inp0 >> sh16;
__vector unsigned int lsb1 = inp1 >> sh16;
__vector unsigned int lsb2 = inp2 >> sh16;
__vector unsigned int lsb3 = inp3 >> sh16;
lsb0 = lsb0 & one;
lsb1 = lsb1 & one;
lsb2 = lsb2 & one;
lsb3 = lsb3 & one;
__vector unsigned int rnd0 = lsb0 + bias;
__vector unsigned int rnd1 = lsb1 + bias;
__vector unsigned int rnd2 = lsb2 + bias;
__vector unsigned int rnd3 = lsb3 + bias;
inp0 = inp0 + rnd0;
inp1 = inp1 + rnd1;
inp2 = inp2 + rnd2;
inp3 = inp3 + rnd3;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel3 =
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
inp2 = vec_sel(inp2, nan, sel2) >> sh16;
inp3 = vec_sel(inp3, nan, sel3) >> sh16;
inp0 = vec_sel(inp0, nan, sel0);
inp1 = vec_sel(inp1, nan, sel1);
inp2 = vec_sel(inp2, nan, sel2);
inp3 = vec_sel(inp3, nan, sel3);
inp0 = inp0 >> sh16;
inp1 = inp1 >> sh16;
inp2 = inp2 >> sh16;
inp3 = inp3 >> sh16;
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}
inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
// 1D softmax over `n` elements in `input`, writes result to `output`.
// Uses FP32Vec8 for main body, scalar tail handling.
// Requirement: n > 0
FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
if (n <= 0) return;
// ---------- Pass 1: find max ----------
float max_val = -std::numeric_limits<float>::infinity();
int i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 v(input + i);
FP32Vec8::AliasReg ar;
ar.reg = v.reg;
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
if (ar.values[j] > max_val) max_val = ar.values[j];
}
}
for (; i < n; ++i) {
if (input[i] > max_val) max_val = input[i];
}
// ---------- Pass 2: compute exp(x - max) and sum ----------
float sum = 0.0f;
i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
float tmp[FP32Vec8::VEC_ELEM_NUM];
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
tmp[j] = input[i + j] - max_val;
}
FP32Vec8 v(tmp);
FP32Vec8 e = v.exp();
FP32Vec8::AliasReg ar;
ar.reg = e.reg;
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
output[i + j] = ar.values[j];
sum += ar.values[j];
}
}
// Tail
for (; i < n; ++i) {
float x = input[i] - max_val;
float ex = std::exp(x); // scalar tail
output[i] = ex;
sum += ex;
}
// ---------- Pass 3: normalize ----------
float inv_sum = 1.0f / sum;
i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
float tmp[FP32Vec8::VEC_ELEM_NUM];
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
tmp[j] = output[i + j] * inv_sum;
}
FP32Vec8 v(tmp);
v.save(output + i);
}
for (; i < n; ++i) {
output[i] *= inv_sum;
}
}
// 1D RMSNorm kernel:
// input: x[0..n-1]
// weight: w[0..n-1] (gamma), may be nullptr
// output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
// eps: small epsilon for numerical stability
FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
const float* weight, int n, float eps) {
if (n <= 0) return;
// ---------- Pass 1: compute sum of squares ----------
float sum_sq = 0.0f;
int i = 0;
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 x_vec(input + i);
FP32Vec8 sq = x_vec * x_vec;
FP32Vec8::AliasReg ar;
ar.reg = sq.reg;
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
sum_sq += ar.values[j];
}
}
// Tail
for (; i < n; ++i) {
float v = input[i];
sum_sq += v * v;
}
float mean_sq = sum_sq / static_cast<float>(n);
float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
// ---------- Pass 2: scale (and apply weight if given) ----------
const float inv_rms_f = inv_rms;
i = 0;
if (weight) {
// with gamma
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 x_vec(input + i);
float wtmp[FP32Vec8::VEC_ELEM_NUM];
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
wtmp[j] = weight[i + j];
}
FP32Vec8 w_vec(wtmp);
FP32Vec8 scale_vec(inv_rms_f);
FP32Vec8 y = x_vec * scale_vec * w_vec;
y.save(output + i);
}
for (; i < n; ++i) {
output[i] = input[i] * inv_rms_f * weight[i];
}
} else {
// without gamma
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
FP32Vec8 x_vec(input + i);
FP32Vec8 scale_vec(inv_rms_f);
FP32Vec8 y = x_vec * scale_vec;
y.save(output + i);
}
for (; i < n; ++i) {
output[i] = input[i] * inv_rms_f;
}
}
}
// Prefetch data to cache for better memory access performance
FORCE_INLINE void prefetch(const void* addr) {
__builtin_prefetch(addr, 0, 3); // 0=read, 3=high temporal locality
}
}; // namespace vec_op

View File

@ -22,15 +22,10 @@ torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
auto strides = cpu_tensor.strides();
auto options = cpu_tensor.options().device(torch::kCUDA);
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
// memory, so we don't free it here.
auto deleter = [](void*) {
// no-op, since the memory is owned by the original CPU tensor
};
// use default no-op deleter, since the memory is owned by the original CPU
// tensor
torch::Tensor cuda_tensor =
torch::from_blob(device_ptr, sizes, strides, deleter, options);
torch::from_blob(device_ptr, sizes, strides, options);
TORCH_CHECK(cuda_tensor.device().is_cuda(),
"Resulting tensor is not on CUDA device");

View File

@ -22,6 +22,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include "cutlass_extensions/common.hpp"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
void run_fp4_blockwise_scaled_group_mm_sm100(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM");
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
void run_fp4_blockwise_scaled_group_mm_sm120(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
using ProblemShape =
cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
using ElementType = cutlass::float_e2m1_t;
using ElementSFType = cutlass::float_ue4m3_t;
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
// NOTE: For SM120 it seems templating the output type is not supported and
// we need to hardcode the output type to bfloat16
using ElementC = cutlass::bfloat16_t;
using ElementD = ElementC;
using ElementAccumulator = float;
// Layout definitions
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutC = cutlass::layout::RowMajor;
using LayoutD = LayoutC;
// Alignment constraints
static constexpr int AlignmentA = 32;
static constexpr int AlignmentB = 32;
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
// Architecture definitions
using ArchTag = cutlass::arch::Sm120;
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
using ClusterShape = Shape<_1, _1, _1>;
using MmaTileShape = Shape<_128, _128, _128>;
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag, OperatorClass, MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
LayoutD*, AlignmentD,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation>::CollectiveOp;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
using GemmKernel =
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
using LayoutSFA =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
using LayoutSFB =
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
using ScaleConfig =
typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
int num_experts = static_cast<int>(expert_offsets.size(0));
auto options_int =
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
torch::Tensor c_strides1 =
torch::full({num_experts}, output.stride(0), options_int);
torch::Tensor a_strides1 =
torch::full({num_experts}, a.stride(0) * 2, options_int);
torch::Tensor b_strides1 =
torch::full({num_experts}, b.stride(1) * 2, options_int);
run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
expert_offsets, sf_offsets, problem_sizes, M, N, K);
// Create an instance of the GEMM
Gemm gemm_op;
// Initialize problem_sizes_as_shapes correctly
UnderlyingProblemShape* problem_sizes_as_shapes =
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
// Set the Scheduler info
cutlass::KernelHardwareInfo hw_info;
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
scheduler.raster_order = RasterOrderOptions::AlongM;
hw_info.device_id = a.get_device();
static std::unordered_map<int, int> cached_sm_counts;
if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
cached_sm_counts[hw_info.device_id] =
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
hw_info.device_id);
}
hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
// Mainloop Arguments
typename GemmKernel::MainloopArguments mainloop_args{
static_cast<const ElementType**>(a_ptrs.data_ptr()),
static_cast<StrideA*>(a_strides1.data_ptr()),
static_cast<const ElementType**>(b_ptrs.data_ptr()),
static_cast<StrideB*>(b_strides1.data_ptr()),
static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
// Epilogue Arguments
typename GemmKernel::EpilogueArguments epilogue_args{
{}, // epilogue.thread
nullptr,
static_cast<StrideC*>(c_strides1.data_ptr()),
static_cast<ElementD**>(out_ptrs.data_ptr()),
static_cast<StrideC*>(c_strides1.data_ptr())};
auto& fusion_args = epilogue_args.thread;
fusion_args.alpha_ptr_array =
reinterpret_cast<float**>(alpha_ptrs.data_ptr());
fusion_args.dAlpha = {_0{}, _0{}, 1};
fusion_args.beta = 0.0f;
// Gemm Arguments
typename GemmKernel::Arguments args{
cutlass::gemm::GemmUniversalMode::kGrouped,
{num_experts, problem_sizes_as_shapes, nullptr},
mainloop_args,
epilogue_args,
hw_info,
scheduler};
size_t workspace_size = Gemm::get_workspace_size(args);
auto const workspace_options =
torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
auto workspace = torch::empty(workspace_size, workspace_options);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
auto can_implement_status = gemm_op.can_implement(args);
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
"Failed to implement GEMM: status=", (int)can_implement_status);
// Run the GEMM
auto status = gemm_op.initialize(args, workspace.data_ptr());
TORCH_CHECK(status == cutlass::Status::kSuccess,
"Failed to initialize GEMM: status=", (int)status,
" workspace_size=", workspace_size, " num_experts=", num_experts,
" M=", M, " N=", N, " K=", K);
status = gemm_op.run(args, workspace.data_ptr(), stream);
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
}
template <typename OutType>
void run_fp4_blockwise_scaled_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
int N, int K) {
int32_t version_num = get_sm_version_num();
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
if (version_num >= 120 && version_num < 130) {
run_fp4_blockwise_scaled_group_mm_sm120(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
if (version_num >= 100 && version_num < 120) {
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
return;
}
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
version_num, ". Required capability: 100 or 120");
}
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
#endif
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
// Input validation
CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
} else {
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
int32_t version_num = get_sm_version_num();
if (version_num >= 120 && version_num < 130) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
output.scalar_type());
}
#endif
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
expert_offsets, sf_offsets, M, N, K);
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled cutlass_fp4_group_mm kernel, vLLM must "
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA "
"12.8 or above.");
"be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
"and CUDA 12.8 or above.");
#endif
}

View File

@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
constexpr auto INT = at::ScalarType::Int;
constexpr auto UINT8 = at::ScalarType::Byte;
void scaled_fp4_experts_quant_sm100a(
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,

View File

@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
torch::Tensor const& input_sf);
#endif
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
void scaled_fp4_experts_quant_sm100a(
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
void scaled_fp4_experts_quant_sm1xxa(
torch::Tensor& output, torch::Tensor& output_scale,
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
torch::Tensor const& input, torch::Tensor const& input_global_scale,
torch::Tensor const& input_offset_by_experts,
torch::Tensor const& output_scale_offset_by_experts) {
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
return scaled_fp4_experts_quant_sm100a(
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
return scaled_fp4_experts_quant_sm1xxa(
output, output_scale, input, input_global_scale, input_offset_by_experts,
output_scale_offset_by_experts);
#endif

View File

@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
std::optional<torch::Tensor> const& bias);
#endif
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
#if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
(defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
(defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
void get_cutlass_moe_mm_data_caller(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
problem_sizes2, input_permutation,
output_permutation, num_experts, n, k,
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
false,
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
"CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_moe_mm_problem_sizes(
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
problem_sizes2, num_experts, n, k,
blockscale_offsets);
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
false,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90 or 100");
version_num, ". Required capability: 90, 100, or 120");
}
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,

View File

@ -695,7 +695,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
cache_ops.def(
"gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, "
" Tensor block_table, Tensor cu_seq_lens, "
" int batch_size, "
" Tensor token_to_seq, "
" int num_tokens, "
" str kv_cache_dtype, "
" Tensor scale, Tensor? seq_starts) -> ()");
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,

View File

@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
@ -85,7 +85,20 @@ ARG GET_PIP_URL
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
&& apt-get install -y --no-install-recommends \
ccache \
software-properties-common \
git \
curl \
sudo \
python3-pip \
libibverbs-dev \
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
gcc-10 \
g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
&& rm -rf /var/lib/apt/lists/* \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
@ -110,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
@ -224,6 +233,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP)
RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
@ -252,7 +277,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
@ -289,8 +314,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& apt-get install -y --no-install-recommends \
software-properties-common \
curl \
sudo \
python3-pip \
ffmpeg \
libsm6 \
libxext6 \
libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \
@ -305,13 +337,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
done ; \
fi \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
libibverbs-dev \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install CUDA development tools and build essentials for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
apt-get update -y && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
@ -356,36 +405,32 @@ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list
# Even when we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
# Install deepgemm wheel that has been built in the `build` stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \
--mount=type=bind,from=build,source=/tmp/deepgemm/dist,target=/tmp/deepgemm/dist,ro \
sh -c 'if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
uv pip install --system /tmp/deepgemm/dist/*.whl; \
else \
echo "No DeepGEMM wheels to install; skipping."; \
fi'
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system ep_kernels/dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
RUN set -eux; \
RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
set -eux; \
case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \
./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
rm ./install_gdrcopy.sh
# Install EP kernels(pplx-kernels and DeepEP)
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
ENV CUDA_HOME=/usr/local/cuda
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
&& bash install_python_libraries.sh
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
@ -415,6 +460,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y git
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
@ -455,12 +505,11 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r requirements/kv_connectors.txt; \
uv pip install --system -r /tmp/kv_connectors.txt; \
fi; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="0.42.0"; \

View File

@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# must put before installing xformers, so it can install the correct version of xfomrers.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build xformers with cuda and torch nightly
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# todo(elainewy): cache xformers build result for faster build
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl --verbose
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system vllm-dist/*.whl --verbose
# install xformers again for the new environment
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
# install package for build flashinfer
@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
RUN pip freeze | grep -E 'torch|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze

View File

@ -1,4 +1,4 @@
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
@ -25,10 +25,14 @@ RUN apt clean && apt-get update -y && \
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
@ -72,6 +76,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
# remove torch bundled oneccl to avoid conflicts
RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y

Binary file not shown.

Before

Width:  |  Height:  |  Size: 119 KiB

After

Width:  |  Height:  |  Size: 146 KiB

View File

@ -49,9 +49,6 @@ llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
!!! warning
CUDA graph capture takes up more memory in V1 than in V0.
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
??? code

View File

@ -31,9 +31,7 @@ In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as re
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics.
With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
In V1, **chunked prefill is enabled by default whenever possible**. With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
This policy has two benefits:

View File

@ -98,21 +98,6 @@ to warm it up so that future builds are faster.
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
</p>
## Update dependencies
Several vLLM dependencies like xFormers depend on PyTorch and need
to be updated accordingly. Rather than waiting for all of them to publish new
releases (which would take too much time), they can be built from
source to unblock the update process.
### xFormers
```bash
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
MAX_JOBS=16 uv pip install --system \
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
```
## Update all the different vLLM platforms
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable

View File

@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.

View File

@ -9,7 +9,7 @@ TL;DR:
|----------|----------|-------------|
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(mode=CompilationMode.NONE) | Turn off CUDAGraphs only |
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
## vLLM-torch.compile overview
@ -151,6 +151,76 @@ To avoid this, please either:
2. wrap the branching logic into a custom operator. TorchDynamo does not
trace into custom operators.
## Debugging constraint violations and dynamic shapes guards issues
Dynamic-shape guards are a specific category of Dynamo guards. They are constraints that `torch.compile`
attaches to dynamic dimensions (e.g., `seq_len`) to ensure the compiled artifact remains valid.
These guards typically appear when framework code, custom passes, or user code branches based on
dynamic shape values.
**Example:**
```python
if x > 10:
# path A
else:
# path B
```
This creates a guard `x > 10` or `x <= 10` depending on which path was traced.
**vLLM's Assumption:**
vLLM assumes that all guards added by torch.compile are safe to drop and will not
constrain the compiled graph to specific input shapes. When this assumption is violated,
it can cause issues that users need to debug.
Some side effects that indicates this assumption is violated are runtime errors
or `ConstraintViolationErrors`.
A `ConstraintViolationErrors` will be thrown if a dynamic shape gets constrained to
a single value. If you encounter a constraint violation error or suspect that a dynamic
shapes guard is being added incorrectly, you can use stricter dynamic shape modes to
help debug the issue:
```sh
# Online - using unbacked mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
# Online - using backed_size_oblivious mode
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
```
```py
# Offline - using unbacked mode
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
LLM(model, compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.UNBACKED)
))
# Offline - using backed_size_oblivious mode
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
LLM(model, compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS)
))
```
These modes are stricter and reduce or eliminate the need of dynamic shapes guarding, which can help isolate issues:
- `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added
- `backed_size_oblivious`: Uses a mode that is more strict about guarding.
For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping).
### Printing guards
To see all guards that are being added during compilation, you can use `TORCH_LOGS=+dynamic`:
```sh
TORCH_LOGS=+dynamic vllm serve meta-llama/Llama-3.2-1B
```
Look for `[guard added]` in the logs to see where guards are being added. This can help you identify which operations are
causing guards to be added incorrectly.
## Debugging TorchInductor
TorchInductor takes a captured graph and then compiles it down to some Python code

View File

@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
- [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod]
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]

View File

@ -29,6 +29,109 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
## Dynamic shapes and vllm guard dropping
`torch.compile` is designed to guard on dynamic shapes with no hesitation
when needed. This contradicts with vLLM's `torch.compile` approach of
dropping the guards since many of those guards could be material.
`torch.compile` provides two kinds of dynamic shapes: `backed` and `unbacked`.
`torch.compile` guards on `backed` dynamic shapes and does not provide a
guarantee that no guards will be added to them. User code, dynamo,
inductor, and autograd all can add guards. Moreover, for 0/1
specializations, backed symbols are specialized unconditionally to 0, 1,
or >=2 even without encountering a branching on those ranges.
On the contrary, `unbacked` dynamic shapes are guaranteed not to be guarded
on and are not 0/1 specialized. However, there is a possibility of
throwing a data dependent error when a branch that requires their value is
encountered and no explicit unbacked handling is defined. The framework is
converging to a state where it won't throw DDE but rather pick general
paths. One downside of using unbacked is missed optimization opportunities
due to either perf bugs or picking general paths, also using a fixed
non-example input-based hint (this will be fixed soon with override_hint
API). An example of picking general paths is assuming input not contiguous
in functions call contiguous() and reshape() when can't be symbolically proven
with a change of introducing a clone.
`backed_size_oblivious` is a flag that enables treating backed symbols as
unbacked wherever explicit handling for unbacked is defined. With this
mode, 0/1 specializations are mostly avoided in framework code and the
default 0/1 specialization does not happen. However, there is still no
guarantee that torch.compile won't guard, especially due to user code or
custom passes. `backed_size_oblivious` is experimental in PyTorch compile
and could be deprecated. That said, it's a safer option to use than
`backed` and the probability of reducing performance is lower than
`unbacked`.
### Configuring Dynamic Shapes
The `DynamicShapesConfig` allows you to control the dynamic shapes behavior by
setting the `type` field. You can choose between three modes:
`BACKED`(default), `UNBACKED` , and `BACKED_SIZE_OBLIVIOUS`.
#### Offline Inference Example (Using LLM class)
When using the `LLM` class for offline inference, you can configure dynamic
shapes through the `compilation_config` parameter:
```python
from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
# Example: Using backed_size_oblivious (experimental, safer than backed)
llm = LLM(
model="meta-llama/Llama-3.2-1B",
compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(
type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS
)
)
)
# Example: Using unbacked (strongest guarantee against guards)
llm = LLM(
model="meta-llama/Llama-3.2-1B",
compilation_config=CompilationConfig(
dynamic_shapes_config=DynamicShapesConfig(
type=DynamicShapesType.UNBACKED
)
)
)
# Generate outputs
prompts = ["Hello, my name is", "The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
```
#### Online Serving Example (Using vllm serve)
When using `vllm serve` for online serving, you can configure dynamic shapes
through the `--compilation-config` flag:
```bash
# Example: Using unbacked
vllm serve meta-llama/Llama-3.2-1B \
--compilation-config '{"dynamic_shapes_config": {"type": "unbacked"}}'
# Alternative: Using dot notation (simpler for single values)
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
```
#### Choosing the Right Mode
- **BACKED** (default): Use when you're willing to accept potential unsafe dropping of guards
for maximal performance. Guard could be unsoundly added and then ignored.
- **UNBACKED** Use when you need the strongest guarantee against guards.
This is the most conservative option but may miss some optimization opportunities.
- **BACKED_SIZE_OBLIVIOUS**: Use when you want a balance between avoiding guards
and performance. This experimental mode is safer than BACKED but still not as
conservative as UNBACKED.
## Python Code Compilation
In the very verbose logs, we can see:
@ -122,7 +225,7 @@ When all the shapes are known, `torch.compile` can compare different configs, an
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
mm 0.0160 ms 81.6%
mm 0.0160 ms 81.6%
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2

View File

@ -22,9 +22,6 @@ export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxab
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8
```
!!! tip
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
!!! tip
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.

View File

@ -7,7 +7,7 @@ This document shows you some examples of the different options that are
available to generate structured outputs.
!!! warning
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
If you are still using the following deprecated API fields which were removed in v0.12.0, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`

View File

@ -283,7 +283,7 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
For AMD ROCm, you can further control the specific Attention implementation using the following variables:

View File

@ -680,6 +680,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
@ -701,6 +702,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |

View File

@ -49,7 +49,8 @@ We currently support the following OpenAI APIs:
- *Note: `suffix` parameter is not supported.*
- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
- *Note: `parallel_tool_calls` and `user` parameters are ignored.*
- *Note: `user` parameter is ignored.*
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
- Only applicable to [embedding models](../models/pooling_models.md).
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)

View File

@ -118,14 +118,16 @@ The common practice is to set the tensor parallel size to the number of GPUs in
```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 8 \
--pipeline-parallel-size 2
--pipeline-parallel-size 2 \
--distributed-executor-backend ray
```
Alternatively, you can set `tensor_parallel_size` to the total number of GPUs in the cluster:
```bash
vllm serve /path/to/the/model/in/the/container \
--tensor-parallel-size 16
--tensor-parallel-size 16 \
--distributed-executor-backend ray
```
## Optimizing network communication for tensor parallelism

View File

@ -1,21 +1,23 @@
# Reproducibility
vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. To achieve
reproducible results, you need to turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
reproducible results:
- In offline mode, you can either set `VLLM_ENABLE_V1_MULTIPROCESSING=0` which makes scheduling deterministic,
or enable [batch invariance](../features/batch_invariance.md) to make the outputs insensitive to scheduling.
- In online mode, you can only enable [batch invariance](../features/batch_invariance.md).
Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
!!! warning
Applying the above settings [changes the random state in user code](#locality-of-random-state).
Setting `VLLM_ENABLE_V1_MULTIPROCESSING=0` will change the random state of user code
(i.e. the code that constructs [LLM][vllm.LLM] class).
!!! note
Even with the above settings, vLLM only provides reproducibility
when it runs on the same hardware and the same vLLM version.
Also, the online serving API (`vllm serve`) does not support reproducibility
because it is almost impossible to make the scheduling deterministic in the
online setting.
## Setting the global seed
@ -23,25 +25,17 @@ The `seed` parameter in vLLM is used to control the random states for various ra
If a specific seed value is provided, the random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly.
However, in some cases, setting the seed will also [change the random state in user code](#locality-of-random-state).
### Default Behavior
In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`.
It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
for workflows such as speculative decoding. For more information, see: <https://github.com/vllm-project/vllm/pull/17929>
!!! note
It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
for workflows such as speculative decoding.
For more information, see: <https://github.com/vllm-project/vllm/pull/17929>
The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM
only if the workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
### Locality of random state
The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM under the following conditions:
- For V0: The seed is specified.
- For V1: The workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
By default, these conditions are not active so you can use vLLM without having to worry about
accidentally making deterministic subsequent operations that rely on random state.
By default, `VLLM_ENABLE_V1_MULTIPROCESSING=1` so you can use vLLM without having to worry about
accidentally making deterministic subsequent operations that rely on random state.

View File

@ -4,9 +4,7 @@
We have fully deprecated V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
## Why vLLM V1?
If you have a use case that works on V0 Engine but not V1, please share it on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
@ -32,16 +30,44 @@ Upgrade to vLLMs Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-
This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1.
## Current Status
## Differences from V0
For each item, our progress towards V1 support falls into one of the following states:
This section lists some differences in behavior between V0 and V1.
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
- **🟢 Functional**: Fully operational, with ongoing optimizations.
- **🚧 WIP**: Under active development.
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
### Chunked Prefill
Chunked prefill is enabled by default whenever possible, unlike in V0 where it was conditionally enabled based on model characteristics.
### CUDA Graphs
CUDA graph capture takes up more memory in V1 than in V0.
### Semantic Changes to Logprobs
#### Logprobs Calculation
By default, logprobs in V1 are now returned immediately once computed from the models raw output (i.e.
before applying any logits post-processing such as temperature scaling or penalty
adjustments). As a result, the returned logprobs do not reflect the final adjusted
probabilities used during sampling.
You can adjust this behavior by setting the `--logprobs-mode` flag.
Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`.
Raw means the values before applying any logit processors, like bad words.
Processed means the values after applying all processors, including temperature and top_k/top_p.
#### Prompt Logprobs with Prefix Caching
While V1 supports passing prompt logprobs with prefix caching enabled, it no longer caches the logprobs.
For a request requiring prompt logprobs, the engine will ignore the prefix cache and recompute the prefill of full prompt to generate the logprobs.
## Feature Support
For each item, its support in vLLM V1 falls into one of the following states:
- **🟢 Functional**: Fully operational with optimizations comparable to or better than V0.
- **🟡 In Progress**: Planned to be in vLLM V1, with open PRs/RFCs.
- **🔴 Removed**: Dropped from vLLM V1. Will only consider re-introducing if there is strong demand.
!!! note
vLLM V1s unified scheduler treats both prompt and output tokens the same
@ -57,13 +83,13 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
### Hardware
| Hardware | Status |
|------------|-----------------------------------------------|
| **NVIDIA** | <nobr>🚀</nobr> |
| **AMD** | <nobr>🟢</nobr> |
| Hardware | Status |
|------------------|-----------------------------------------------|
| **NVIDIA** | <nobr>🟢</nobr> |
| **AMD** | <nobr>🟢</nobr> |
| **INTEL GPU** | <nobr>🟢</nobr> |
| **TPU** | <nobr>🟢</nobr> |
| **CPU** | <nobr>🟢 (x86\_64/aarch64) 🟡 (MacOS) </nobr> |
| **TPU** | <nobr>🟢</nobr> |
| **CPU** | <nobr>🟢</nobr> |
!!! note
@ -78,23 +104,21 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
### Models
| Model Type | Status |
|-----------------------------|------------------------------------------------------------------------------------|
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
| **Encoder-Decoder Models** | <nobr>🟢 Whisper only</nobr> |
| **Embedding Models** | <nobr>🟢 Functional</nobr> |
| **Mamba Models** | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr> |
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
| Model Type | Status |
|-----------------------------|-------------------------------------------------------------------------|
| **Decoder-only Models** | <nobr>🟢</nobr> |
| **Encoder-Decoder Models** | <nobr>🟢 (Whisper), 🔴 (Others) </nobr> |
| **Pooling Models** | <nobr>🟢</nobr> |
| **Mamba Models** | <nobr>🟢</nobr> |
| **Multimodal Models** | <nobr>🟢</nobr> |
See below for the status of models that are not yet supported or have more features planned in V1.
#### Embedding Models
#### Pooling Models
The initial basic support is now functional.
Now fully supported, with prefix caching and chunked prefill newly available for last-pooling models.
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
to enable simultaneous generation and embedding using the same engine instance in V1.
We are working on enabling prefix caching and chunked prefill for more categories of pooling models.
#### Mamba Models
@ -112,24 +136,25 @@ Please note that prefix caching is not yet supported for any of the above models
Whisper is supported. Other models requiring cross-attention between separate
encoder and decoder (e.g., `BartForConditionalGeneration`,
`MllamaForConditionalGeneration`) are not supported.
`MllamaForConditionalGeneration`) are no longer supported.
### Features
| Feature | Status |
|---------------------------------------------|-----------------------------------------------------------------------------------|
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
| **LoRA** | <nobr>🚀 Optimized</nobr> |
| **Prefix Caching** | <nobr>🟢 Functional</nobr> |
| **Chunked Prefill** | <nobr>🟢 Functional</nobr> |
| **LoRA** | <nobr>🟢 Functional</nobr> |
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices (<https://github.com/vllm-project/vllm/pull/15191>)</nobr>|
| **Spec Decode** | <nobr>🚀 Optimized</nobr> |
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
| **FP8 KV Cache** | <nobr>🟢 Functional</nobr> |
| **Spec Decode** | <nobr>🟢 Functional</nobr> |
| **Prompt Logprobs with Prefix Caching** | <nobr>🟢 Functional</nobr> |
| **Structured Output Alternative Backends** | <nobr>🟢 Functional</nobr> |
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
| **Concurrent Partial Prefills** | <nobr>🟡 [In Progress](https://github.com/vllm-project/vllm/issues/14003)</nobr> |
| **best_of** | <nobr>🔴 [Removed](https://github.com/vllm-project/vllm/issues/13361)</nobr> |
| **Per-Request Logits Processors** | <nobr>🔴 [Removed](https://github.com/vllm-project/vllm/pull/13360)</nobr> |
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Removed</nobr> |
| **Request-level Structured Output Backend** | <nobr>🔴 Removed</nobr> |
!!! note
@ -139,37 +164,16 @@ encoder and decoder (e.g., `BartForConditionalGeneration`,
prefix caching, and speculative decoding without a strict separation between prefill
and decode phases.
#### Semantic Changes to Logprobs
#### Removed Features
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
differences compared to V0:
##### Logprobs Calculation
By default, logprobs in V1 are now returned immediately once computed from the models raw output (i.e.
before applying any logits post-processing such as temperature scaling or penalty
adjustments). As a result, the returned logprobs do not reflect the final adjusted
probabilities used during sampling.
You can adjust this behavior by setting the `--logprobs-mode` flag.
Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`.
Raw means the values before applying any logit processors, like bad words.
Processed means the values after applying all processors, including temperature and top_k/top_p.
##### Prompt Logprobs with Prefix Caching
Logprobs are not cached. For a request requiring prompt logprobs, the engine will ignore the prefix cache and recompute the prefill of full prompt to generate the logprobs.
#### Deprecated Features
As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
As part of the major architectural rework in vLLM V1, several legacy features have been removed.
##### Sampling features
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
- **best_of**: This feature has been removed due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
- **Per-Request Logits Processors**: In V0, users could pass custom
processing functions to adjust logits on a per-request basis. In vLLM V1, this
feature has been deprecated. Instead, we now support **global logits processors**
feature has been removed. Instead, we now support **global logits processors**
which are set at startup time, see [RFC #17799](https://github.com/vllm-project/vllm/issues/17799).
##### KV Cache features
@ -179,4 +183,4 @@ to handle request preemptions.
##### Structured Output features
- **Request-level Structured Output Backend**: Deprecated, alternative backends (outlines, guidance) with fallbacks is supported now.
- **Request-level Structured Output Backend**: Removed; alternative backends (outlines, guidance) with fallbacks are supported now.

15
examples/offline_inference/audio_language.py Normal file → Executable file
View File

@ -425,6 +425,13 @@ def parse_args():
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -434,6 +441,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
audio_count = args.num_audios
req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count
@ -446,6 +459,8 @@ def main(args):
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
# We set temperature to 0.2 so that outputs can be different

View File

@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""
from typing import NamedTuple
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser
class QueryResult(NamedTuple):
inputs: dict
limit_mm_per_prompt: dict[str, int]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
def get_mixed_modalities_query() -> QueryResult:
question = (
"What is recited in the audio? "
"What is the content of this image? Why is this video funny?"
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
"<|vision_start|><|image_pad|><|vision_end|>"
"<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
"image": convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB"
),
"video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
},
},
limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
)
def get_use_audio_in_video_query() -> QueryResult:
question = (
"Describe the content of the video in details, then convert what the "
"baby say into text."
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"video": asset.np_ndarrays,
"audio": audio,
},
"mm_processor_kwargs": {
"use_audio_in_video": True,
},
},
limit_mm_per_prompt={"audio": 1, "video": 1},
)
def get_multi_audios_query() -> QueryResult:
question = "Are these two audio clips the same?"
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
"<|audio_start|><|audio_pad|><|audio_end|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": [
AudioAsset("winning_call").audio_and_sample_rate,
AudioAsset("mary_had_lamb").audio_and_sample_rate,
],
},
},
limit_mm_per_prompt={
"audio": 2,
},
)
query_map = {
"mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query,
"multi_audios": get_multi_audios_query,
}
def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
query_result = query_map[args.query_type]()
llm = LLM(
model=model_name,
max_model_len=12800,
max_num_seqs=5,
limit_mm_per_prompt=query_result.limit_mm_per_prompt,
seed=args.seed,
)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"audio language models"
)
parser.add_argument(
"--query-type",
"-q",
type=str,
default="mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@ -11,8 +11,11 @@ import random
from vllm import LLM, SamplingParams
# Turn off multiprocessing to make the scheduling deterministic.
# Either:
## Turn off multiprocessing to make the scheduling deterministic, or
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
## Enable batch invariance to get consistent results regardless of scheduling.
os.environ["VLLM_BATCH_INVARIANT"] = "1"
prompts = [
"Hello, my name is",

41
examples/offline_inference/vision_language.py Normal file → Executable file
View File

@ -538,6 +538,31 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
)
# HunyuanOCR
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "tencent/HunyuanOCR"
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
limit_mm_per_prompt={modality: 1},
)
placeholder = "<hy_place▁holder▁no▁100><hy_place▁holder▁no▁102><hy_place▁holder▁no▁101>" # noqa: E501
prompts = [
f"<hy_begin▁of▁sentence>{placeholder}{question}<hy_User>"
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=None,
)
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
questions: list[str], modality: str
@ -1820,6 +1845,7 @@ model_example_map = {
"glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8,
"h2ovl_chat": run_h2ovl,
"hunyuan_vl": run_hunyuan_vl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
"interns1": run_interns1,
@ -2038,6 +2064,13 @@ def parse_args():
help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -2046,6 +2079,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
modality = args.modality
mm_input = get_multi_modal_input(args)
data = mm_input["data"]
@ -2063,6 +2102,8 @@ def main(args):
"seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`.

View File

@ -1110,6 +1110,7 @@ def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model=model_name,
max_model_len=16384,
max_num_seqs=16,
trust_remote_code=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
@ -1351,10 +1352,18 @@ model_example_map = {
}
def run_generate(model, question: str, image_urls: list[str], seed: int | None):
def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
sampling_params = SamplingParams(
@ -1377,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory
@ -1387,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
sampling_params = (
@ -1462,6 +1479,13 @@ def parse_args():
default=2,
help="Number of images to use for the demo.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args()
@ -1469,13 +1493,20 @@ def main(args: Namespace):
model = args.model_type
method = args.method
seed = args.seed
tensor_parallel_size = args.tensor_parallel_size
if tensor_parallel_size is not None and tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {tensor_parallel_size}"
)
image_urls = IMAGE_URLS[: args.num_images]
if method == "generate":
run_generate(model, QUESTION, image_urls, seed)
run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
elif method == "chat":
run_chat(model, QUESTION, image_urls, seed)
run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
else:
raise ValueError(f"Invalid method: {method}")

View File

@ -25,25 +25,17 @@ import gradio as gr
from openai import OpenAI
def format_history_to_openai(history):
history_openai_format = [
{"role": "system", "content": "You are a great AI assistant."}
]
for human, assistant in history:
history_openai_format.append({"role": "user", "content": human})
history_openai_format.append({"role": "assistant", "content": assistant})
return history_openai_format
def predict(message, history, client, model_name, temp, stop_token_ids):
# Format history to OpenAI chat format
history_openai_format = format_history_to_openai(history)
history_openai_format.append({"role": "user", "content": message})
messages = [
{"role": "system", "content": "You are a great AI assistant."},
*history,
{"role": "user", "content": message},
]
# Send request to OpenAI API (vLLM server)
stream = client.chat.completions.create(
model=model_name,
messages=history_openai_format,
messages=messages,
temperature=temp,
stream=True,
extra_body={

View File

@ -22,7 +22,6 @@ API_KEY=${API_KEY:-"your-api-key"}
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
export VLLM_ENABLE_CHUNKED_PROCESSING=true
export CUDA_VISIBLE_DEVICES=2,3,4,5
# export VLLM_ATTENTION_BACKEND=XFORMERS
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
echo "=================================================================="

View File

@ -0,0 +1,44 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server.
Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses
For example:
vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3
"""
from openai import OpenAI
input_messages = [{"role": "user", "content": "What model are you?"}]
def main():
base_url = "http://localhost:8000/v1"
client = OpenAI(base_url=base_url, api_key="empty")
model = "Qwen/Qwen3-8B" # get_first_model(client)
response = client.responses.create(
model=model,
input=input_messages,
)
for message in response.output:
if message.type == "reasoning":
# append reasoning message
input_messages.append(message)
response_2 = client.responses.create(
model=model,
input=input_messages,
)
print(response_2.output_text)
# I am Qwen, a large language model developed by Alibaba Cloud.
# I am designed to assist with a wide range of tasks, including
# answering questions, creating content, coding, and engaging in
# conversations. I can help with various topics and provide
# information or support in multiple languages. How can I assist you today?
if __name__ == "__main__":
main()

View File

@ -9,6 +9,5 @@ torch==2.9.0
torchaudio==2.9.0
# These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2

View File

@ -1,2 +1,2 @@
lmcache
nixl >= 0.6.0 # Required for disaggregated prefill
nixl >= 0.7.1 # Required for disaggregated prefill

View File

@ -39,3 +39,13 @@ mteb[bm25s]>=1.38.11, <2
# Required for eval tests
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# Required for multiprocessed tests that use spawn method
multiprocess==0.70.16
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0
# Required for suffix decoding test
arctic-inference == 0.1.1

View File

@ -10,9 +10,9 @@ wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.61.2 # Required for N-gram speculative decoding
torch==2.8.0+xpu
--extra-index-url=https://download.pytorch.org/whl/xpu
torch==2.9.0+xpu
torchaudio
torchvision
--extra-index-url=https://download.pytorch.org/whl/xpu
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl

View File

@ -74,18 +74,6 @@ def is_ninja_available() -> bool:
return which("ninja") is not None
def is_url_available(url: str) -> bool:
from urllib.request import urlopen
status = None
try:
with urlopen(url) as f:
status = f.status
except Exception:
return False
return status == 200
class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
super().__init__(name, sources=[], py_limited_api=True, **kwa)
@ -533,28 +521,6 @@ def get_nvcc_cuda_version() -> Version:
return nvcc_cuda_version
def get_gaudi_sw_version():
"""
Returns the driver version.
"""
# Enable console printing for `hl-smi` check
output = subprocess.run(
"hl-smi",
shell=True,
text=True,
capture_output=True,
env={"ENABLE_CONSOLE": "true"},
)
if output.returncode == 0 and output.stdout:
return (
output.stdout.split("\n")[2]
.replace(" ", "")
.split(":")[1][:-1]
.split("-")[0]
)
return "0.0.0" # when hl-smi is not available
def get_vllm_version() -> str:
# Allow overriding the version. This is useful to build platform-specific
# wheels (e.g. CPU, TPU) without modifying the source.

View File

@ -74,9 +74,6 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
pytest.skip(f"{backend} does not support gemma2 with full context length.")
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)

View File

@ -111,6 +111,17 @@ if current_platform.is_cuda():
async_tp=96, # MLP is MoE, half the fusions of dense
),
),
ModelBackendTestCase(
model_name="openai/gpt-oss-20b",
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER,
matches=Matches(
attention_fusion=0,
allreduce_fusion=49,
sequence_parallel=49,
async_tp=48,
),
),
]
elif current_platform.is_rocm():

View File

@ -55,7 +55,7 @@ class SillyModel(nn.Module):
def _run_simple_model(
splitting_ops,
use_inductor_graph_partition,
use_inductor,
backend,
expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations,
@ -64,7 +64,7 @@ def _run_simple_model(
vllm_config = VllmConfig(
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
use_inductor=use_inductor,
backend=backend,
splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True,
@ -124,14 +124,14 @@ def _run_simple_model(
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False])
@pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode()
@create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor):
def test_simple_piecewise_compile(backend):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=False,
use_inductor=use_inductor,
backend=backend,
# 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5,
# 1 + num_layers
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
_run_simple_model(
splitting_ops=["silly::attention"],
use_inductor_graph_partition=True,
use_inductor=True,
backend="inductor",
# Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level

View File

@ -0,0 +1,88 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import pytest
import torch
from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationMode, DynamicShapesType
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils.torch_utils import is_torch_equal_or_newer
def get_test_models():
"""Get list of models to test based on PyTorch version"""
# TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
@pytest.mark.parametrize("model_name", get_test_models())
@pytest.mark.parametrize(
"shapes_type",
[
DynamicShapesType.BACKED,
DynamicShapesType.UNBACKED,
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
],
)
@pytest.mark.parametrize("use_aot_compile", ["0"])
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.skipif(
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
)
def test_dynamic_shapes_compilation(
monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
):
"""Test that all dynamic shapes types compile successfully"""
print(
f"\nTesting model: {model_name} with {shapes_type.name}, "
f"AOT compile: {use_aot_compile}, "
f"Bytecode hook: {use_bytecode_hook}"
)
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
prompt = "Hello, my name is"
print(f"Testing {shapes_type.name} dynamic shapes...")
# Initialize the model with specific dynamic shapes configuration
model = LLM(
model=model_name,
compilation_config={
"mode": CompilationMode.VLLM_COMPILE,
"dynamic_shapes_config": {
"type": shapes_type.value,
},
},
)
output = model.generate(prompt)
result = output[0].outputs[0].text
# Example of setting the sampling parameters
tokenizer = get_tokenizer(model_name)
yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
no_tokens = tokenizer.encode("no", add_special_tokens=False)
allowed_ids = list(set(yes_tokens + no_tokens))
sampling_params = SamplingParams(
max_tokens=1, temperature=0, allowed_token_ids=allowed_ids
)
output = model.generate(
"answer with yes or no is " + result + " rubbish for prompt " + prompt + "?",
sampling_params=sampling_params,
)
result = output[0].outputs[0].text
assert result == "yes"
# Clean up GPU memory
del model
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
print("GPU memory cleared")

View File

@ -748,6 +748,14 @@ class VllmRunner:
# being captured which can trigger edge cases that we don't handle yet.
kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]}
# Make sure we have atleast one cudagraph large enough for a single decode.
if (speculative_config := kwargs.get("speculative_config")) and (
num_speculative_tokens := speculative_config["num_speculative_tokens"]
):
kwargs["compilation_config"]["cudagraph_capture_sizes"].append(
num_speculative_tokens + 1
)
with init_ctx:
self.llm = LLM(
model=model_name,
@ -845,6 +853,7 @@ class VllmRunner:
@staticmethod
def _final_steps_generate_w_logprobs(
req_outputs: list[RequestOutput],
include_prompt_token_ids: bool = False,
) -> list[TokensTextLogprobsPromptLogprobs]:
outputs: list[TokensTextLogprobsPromptLogprobs] = []
for req_output in req_outputs:
@ -853,9 +862,26 @@ class VllmRunner:
output_str = sample.text
output_ids = list(sample.token_ids)
output_logprobs = sample.logprobs
outputs.append(
(output_ids, output_str, output_logprobs, req_output.prompt_logprobs)
)
if include_prompt_token_ids:
outputs.append(
( # type: ignore[arg-type]
output_ids,
output_str,
output_logprobs,
req_output.prompt_token_ids,
req_output.prompt_logprobs,
)
)
else:
outputs.append(
(
output_ids,
output_str,
output_logprobs,
req_output.prompt_logprobs,
)
)
return outputs
def generate_w_logprobs(
@ -865,6 +891,7 @@ class VllmRunner:
images: PromptImageInput | None = None,
audios: PromptAudioInput | None = None,
videos: PromptVideoInput | None = None,
include_prompt_token_ids: bool = False,
**kwargs: Any,
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
@ -874,7 +901,7 @@ class VllmRunner:
)
toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(
req_outputs
req_outputs, include_prompt_token_ids
)
# Omit prompt logprobs if not required by sampling params
return (

View File

@ -0,0 +1,49 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import torch
import torch.multiprocessing as mp
from vllm.distributed.parallel_state import (
init_distributed_environment,
)
from vllm.utils.system_utils import update_environment_variables
mp.set_start_method("spawn", force=True)
def distributed_run(fn, world_size, *args):
number_of_processes = world_size
processes: list[mp.Process] = []
for i in range(number_of_processes):
env: dict[str, str] = {}
env["RANK"] = str(i)
env["LOCAL_RANK"] = str(i)
env["WORLD_SIZE"] = str(number_of_processes)
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = mp.Process(target=fn, args=(env, world_size, *args))
processes.append(p)
p.start()
for p in processes:
p.join()
for p in processes:
assert p.exitcode == 0
def set_env_vars_and_device(env: dict[str, str]) -> None:
update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
init_distributed_environment()
# Ensure each worker process has the same random seed
random.seed(42)
torch.manual_seed(42)

View File

@ -1,57 +1,24 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import asyncio
import random
import pytest
import torch
import torch.distributed
import torch.multiprocessing as mp
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
from vllm.distributed.eplb.rebalance_execute import (
move_from_buffer,
rearrange_expert_weights_inplace,
transfer_layer,
)
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
init_distributed_environment,
)
from vllm.utils.system_utils import update_environment_variables
mp.set_start_method("spawn", force=True)
def distributed_run(fn, world_size, *args):
number_of_processes = world_size
processes: list[mp.Process] = []
for i in range(number_of_processes):
env: dict[str, str] = {}
env["RANK"] = str(i)
env["LOCAL_RANK"] = str(i)
env["WORLD_SIZE"] = str(number_of_processes)
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = mp.Process(target=fn, args=(env, world_size, *args))
processes.append(p)
p.start()
for p in processes:
p.join()
for p in processes:
assert p.exitcode == 0
def set_env_vars_and_device(env: dict[str, str]) -> None:
update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
init_distributed_environment()
# Ensure each worker process has the same random seed
random.seed(42)
torch.manual_seed(42)
from .eplb_utils import distributed_run, set_env_vars_and_device
def create_expert_indices_with_redundancy(
@ -269,6 +236,100 @@ def verify_redundant_experts_have_same_weights(
)
def _test_async_transfer_layer_without_mtp_worker(
env,
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
tp_group = get_tp_group()
ep_group = tp_group.device_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
total_physical_experts = world_size * num_local_experts
hidden_sizes = [16, 32]
redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
old_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
redundancy_config,
)
new_redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
new_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
new_redundancy_config,
)
expert_weights = create_expert_weights(
num_layers,
num_local_experts,
hidden_sizes,
ep_rank,
device,
old_indices,
)
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
cuda_stream = torch.cuda.Stream(device=device)
for layer_idx in range(num_layers):
is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
transfer_layer(
old_global_expert_indices=old_indices,
new_global_expert_indices=new_indices,
expert_weights=expert_weights,
expert_weights_buffer=expert_buffer,
ep_group=ep_group,
layer=layer_idx,
cuda_stream=cuda_stream,
)
)
cuda_stream.synchronize()
move_from_buffer(
expert_weights=expert_weights[layer_idx],
expert_weights_buffer=expert_buffer,
is_unchanged=is_unchanged,
is_received_locally=is_received_locally,
experts_recv_loc=experts_recv_loc,
new_indices=new_indices[layer_idx].tolist(),
ep_group=ep_group,
)
verify_expert_weights_after_shuffle(
expert_weights,
new_indices,
hidden_sizes,
ep_rank,
num_local_experts,
)
verify_redundant_experts_have_same_weights(
expert_weights,
new_indices,
hidden_sizes,
world_size,
num_local_experts,
)
def _test_rearrange_expert_weights_with_redundancy(
env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
@ -437,6 +498,32 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
)
@pytest.mark.parametrize(
"world_size,num_layers,num_local_experts,num_logical_experts",
[
(2, 2, 2, 3),
],
)
def test_async_transfer_layer_without_mtp(
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
):
"""Exercise async EPLB transfer path without MTP/spec decode."""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_async_transfer_layer_without_mtp_worker,
world_size,
num_layers,
num_local_experts,
num_logical_experts,
)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
"""

View File

@ -0,0 +1,285 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Test that the interaction between EPLB and FusedMoE Layer is okay
from dataclasses import dataclass
import pytest
import torch
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
)
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from .eplb_utils import distributed_run, set_env_vars_and_device
@dataclass
class TestConfig:
num_layers: int
num_experts: int
num_local_experts: int
num_topk: int
hidden_size: int
intermediate_size: int
weight_dtype: torch.dtype
weight_scale_dtype: torch.dtype | None
column_major_scales: bool
def make_expert_weights(
layer_idx: int,
global_expert_idx: int,
global_num_experts: int,
tensor_shape: tuple[int, ...],
tensor_dtype: torch.dtype,
tensor_device: torch.device,
is_column_major: bool,
) -> torch.Tensor:
assert len(tensor_shape) == 2
if is_column_major:
tensor_shape = (tensor_shape[1], tensor_shape[0])
x = torch.empty(tensor_shape, dtype=tensor_dtype, device=tensor_device)
value_offset = (layer_idx * global_num_experts + global_expert_idx) * x.numel()
x.view(-1).copy_(
torch.arange(
value_offset,
value_offset + x.numel(),
dtype=tensor_dtype,
device=tensor_device,
)
)
if is_column_major:
x = torch.transpose(x, 1, 0)
assert not x.is_contiguous()
return x
def make_fused_moe_layer(
rank: int,
layer_idx: int,
test_config: TestConfig,
) -> FusedMoE:
fml = FusedMoE(
num_experts=test_config.num_experts,
top_k=test_config.num_topk,
hidden_size=test_config.hidden_size,
intermediate_size=test_config.intermediate_size,
prefix=f"dummy_layer_{layer_idx}",
activation="silu",
is_act_and_mul=True,
params_dtype=test_config.weight_dtype,
)
device = torch.device(f"cuda:{rank}")
from functools import partial
_make_expert_weights = partial(
make_expert_weights,
layer_idx=layer_idx,
global_num_experts=test_config.num_experts,
tensor_device=device,
)
assert isinstance(fml.w13_weight.data, torch.Tensor)
assert isinstance(fml.w2_weight.data, torch.Tensor)
fml.w13_weight.data = fml.w13_weight.data.to(device=device)
fml.w2_weight.data = fml.w2_weight.data.to(device=device)
w13_weight = fml.w13_weight.data
w2_weight = fml.w2_weight.data
assert w13_weight.size(0) == test_config.num_local_experts
for i in range(test_config.num_local_experts):
g_i = rank * test_config.num_local_experts + i
w13_weight_e = w13_weight[i]
w2_weight_e = w2_weight[i]
w13_weight_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w13_weight_e.shape,
tensor_dtype=w13_weight_e.dtype,
is_column_major=False,
)
)
w2_weight_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w2_weight_e.shape,
tensor_dtype=w2_weight_e.dtype,
is_column_major=False,
)
)
block_size = 16
def block_quant_scales_shape(
shape: tuple[int, ...], is_column_major: bool
) -> tuple[int, ...]:
assert len(shape) == 3
if not is_column_major:
return (shape[0], shape[1] // block_size, shape[2] // block_size)
else:
return (shape[0], shape[2] // block_size, shape[1] // block_size)
is_column_major = test_config.column_major_scales
w13_weight_scale_inv = torch.empty(
block_quant_scales_shape(w13_weight.shape, is_column_major),
dtype=test_config.weight_dtype,
device=device,
)
w2_weight_scale_inv = torch.empty(
block_quant_scales_shape(w2_weight.shape, is_column_major),
dtype=test_config.weight_dtype,
device=device,
)
for i in range(test_config.num_local_experts):
g_i = rank * test_config.num_local_experts + i
w13_s_e = w13_weight_scale_inv[i]
w2_s_e = w2_weight_scale_inv[i]
w13_s_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w13_s_e.shape,
tensor_dtype=w13_s_e.dtype,
# Fill data in row-major and then
# transpose if test_config requires col-major.
is_column_major=False,
)
)
w2_s_e.copy_(
_make_expert_weights(
global_expert_idx=g_i,
tensor_shape=w2_s_e.shape,
tensor_dtype=w2_s_e.dtype,
is_column_major=False,
)
)
if is_column_major:
w13_weight_scale_inv = torch.transpose(w13_weight_scale_inv, 1, 2)
w2_weight_scale_inv = torch.transpose(w2_weight_scale_inv, 1, 2)
assert not w13_weight_scale_inv.is_contiguous()
assert not w2_weight_scale_inv.is_contiguous()
# Add scales to the parameter list
fml.w13_weight_scale_inv = torch.nn.Parameter(
w13_weight_scale_inv, requires_grad=False
)
fml.w2_weight_scale_inv = torch.nn.Parameter(
w2_weight_scale_inv, requires_grad=False
)
return fml
def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
# Initialize model parallel (using tensor parallel as an entrypoint
# to expert parallel)
set_env_vars_and_device(env)
vllm_config = VllmConfig()
vllm_config.parallel_config.tensor_parallel_size = world_size
vllm_config.parallel_config.enable_expert_parallel = True
with set_current_vllm_config(vllm_config):
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
ep_group = get_tp_group().cpu_group
ep_rank = torch.distributed.get_rank()
fml_layers = [
make_fused_moe_layer(ep_rank, layer_idx, test_config)
for layer_idx in range(test_config.num_layers)
]
rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers]
indices = torch.zeros(
test_config.num_layers, test_config.num_experts, dtype=torch.long
)
for lidx in range(test_config.num_layers):
indices[lidx] = torch.Tensor(range(test_config.num_experts))
shuffled_indices = torch.zeros_like(indices)
for lidx in range(test_config.num_layers):
shuffled_indices[lidx] = torch.randperm(test_config.num_experts)
rearrange_expert_weights_inplace(
indices,
shuffled_indices,
rank_expert_weights,
ep_group,
is_profile=False,
)
num_local_experts = test_config.num_local_experts
num_global_experts = test_config.num_experts
for lidx, fml in enumerate(fml_layers):
for name, w in fml.named_parameters():
for e in range(num_local_experts):
g_e = shuffled_indices[lidx][ep_rank * num_local_experts + e]
ref = make_expert_weights(
layer_idx=lidx,
global_expert_idx=int(g_e.item()),
global_num_experts=num_global_experts,
tensor_shape=w[e].shape,
tensor_dtype=w[e].dtype,
tensor_device=w[e].device,
is_column_major=not w[e].is_contiguous(),
)
assert w[e].shape == ref.shape and w[e].stride() == ref.stride(), (
f"w[{e}] {w[e].size()} {w[e].stride()} vs "
f"ref {ref.size()} {ref.stride()}"
)
torch.testing.assert_close(w[e], ref)
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.parametrize("num_layers", [4])
@pytest.mark.parametrize("num_experts", [16])
@pytest.mark.parametrize("hidden_size", [256])
@pytest.mark.parametrize("intermediate_size", [256])
@pytest.mark.parametrize("column_major_scales", [True, False])
def test_eplb_fml(
world_size: int,
num_layers: int,
num_experts: int,
hidden_size: int,
intermediate_size: int,
column_major_scales: bool,
):
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
num_local_experts = num_experts // world_size
num_topk = 4
# The dtypes are fine as we are essentially just checking data-copies
weight_dtype = torch.bfloat16
weight_scale_dtype = torch.bfloat16
test_config = TestConfig(
num_layers=num_layers,
num_experts=num_experts,
num_local_experts=num_local_experts,
num_topk=num_topk,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
weight_dtype=weight_dtype,
weight_scale_dtype=weight_scale_dtype,
column_major_scales=column_major_scales,
)
distributed_run(
_test_eplb_fml,
world_size,
test_config,
)

View File

@ -10,10 +10,11 @@ from tests.utils import large_gpu_mark
def get_model_args(
model_name: str,
spec_model_name: str,
spec_model_name: str | None,
spec_method: str,
tp_size: int,
model_max_len: int,
use_async: bool = False,
) -> dict:
speculative_config = {
"method": spec_method,
@ -37,6 +38,8 @@ def get_model_args(
"enable_eplb": True,
"max_model_len": model_max_len,
}
if use_async:
model_args["eplb_config"] = {"use_async": True}
return model_args
@ -94,3 +97,37 @@ def test_eplb_spec_decode(
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
@large_gpu_mark(min_gb=80)
def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
"""
Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
"""
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
expected_gsm8k_value = 0.86
model_args = get_model_args(
model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
spec_model_name=None,
spec_method="mtp",
tp_size=4,
model_max_len=4096,
use_async=True,
)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=TASK,
batch_size=64,
num_fewshot=8,
)
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"

View File

@ -1,9 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
import os
import multiprocess as mp
import numpy as np
import pytest
import torch
@ -20,10 +20,12 @@ from vllm.distributed.parallel_state import (
)
from vllm.utils.system_utils import update_environment_variables
mp.set_start_method("spawn", force=True)
def distributed_run(fn, world_size):
number_of_processes = world_size
processes: list[multiprocessing.Process] = []
processes: list[mp.Process] = []
for i in range(number_of_processes):
env: dict[str, str] = {}
env["RANK"] = str(i)
@ -32,7 +34,7 @@ def distributed_run(fn, world_size):
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
env["MASTER_ADDR"] = "localhost"
env["MASTER_PORT"] = "12345"
p = multiprocessing.Process(target=fn, args=(env,))
p = mp.Process(target=fn, args=(env,))
processes.append(p)
p.start()

View File

@ -249,14 +249,13 @@ def test_compilation_config():
args = parser.parse_args(
[
"-O",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}',
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
]
)
assert (
args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor
and args.compilation_config.backend == "eager"
)
# set to string form of a dict
@ -264,13 +263,13 @@ def test_compilation_config():
[
"--compilation-config="
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}',
'"backend": "inductor"}',
]
)
assert (
args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor
and args.compilation_config.backend == "inductor"
)
@ -278,8 +277,9 @@ def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([])
# should be None by default (depends on model).
engine_args = EngineArgs.from_cli_args(args=args)
assert not engine_args.enable_prefix_caching, "prefix caching defaults to off."
assert engine_args.enable_prefix_caching is None
# with flag to turn it on.
args = parser.parse_args(["--enable-prefix-caching"])

View File

@ -183,9 +183,6 @@ async def test_metrics_counts(
EXPECTED_METRICS_V1 = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:kv_cache_usage_perc",
"vllm:prefix_cache_queries",
"vllm:prefix_cache_hits",

View File

@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio
from openai import OpenAI
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-8B"
@pytest.fixture(scope="module")
def server():
args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",
# uncomment for tool calling
# PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
)
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_basic(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input="What is 13 * 24?",
)
assert response is not None
print("response: ", response)
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input=[
{"type": "message", "content": "Hello.", "role": "user"},
{
"type": "reasoning",
"id": "lol",
"content": [
{
"type": "reasoning_text",
"text": "We need to respond: greeting.",
}
],
"summary": [],
},
],
temperature=0.0,
)
assert response is not None
assert response.status == "completed"
# make sure we get a reasoning and text output
assert response.output[0].type == "reasoning"
assert response.output[1].type == "message"
assert type(response.output[1].content[0].text) is str

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import json
import time
@ -35,7 +35,11 @@ GET_WEATHER_SCHEMA = {
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--tool-server", "demo"]
assert importlib.util.find_spec("gpt_oss") is not None, (
"Harmony tests require gpt_oss package to be installed"
)
args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",
PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
@ -550,6 +554,31 @@ def call_function(name, args):
raise ValueError(f"Unknown function: {name}")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_reasoning_item(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
input=[
{"type": "message", "content": "Hello.", "role": "user"},
{
"type": "reasoning",
"id": "lol",
"content": [
{
"type": "reasoning_text",
"text": "We need to respond: greeting.",
}
],
"summary": [],
},
],
temperature=0.0,
)
assert response is not None
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling(client: OpenAI, model_name: str):

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
@ -132,3 +134,129 @@ def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
assert result.tool_calls[0].function.name == "searchTool"
assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
assert result.tool_calls[2].function.name == "searchTool"
def test_extract_tool_calls_deeply_nested_json(parser):
# Test with deeply nested JSON parameters (5 levels)
model_output = (
'{"name": "complexTool", '
'"parameters": {'
'"level1": {'
'"level2": {'
'"level3": {'
'"level4": {'
'"value": "deep"'
"}}}}}}"
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 1
assert result.tool_calls[0].function.name == "complexTool"
# Verify the nested structure is preserved in the arguments
import json
args = json.loads(result.tool_calls[0].function.arguments)
assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
def test_extract_tool_calls_multiple_with_deep_nesting(parser):
# Test with multiple tool calls where some have deeply nested parameters
model_output = (
'{"name": "simpleTool", "parameters": {"value": "test"}}; '
'{"name": "complexTool", "parameters": '
'{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 2
# Check first tool call
assert result.tool_calls[0].function.name == "simpleTool"
import json
args0 = json.loads(result.tool_calls[0].function.arguments)
assert args0["value"] == "test"
# Check second tool call with deep nesting
assert result.tool_calls[1].function.name == "complexTool"
args1 = json.loads(result.tool_calls[1].function.arguments)
assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
# Test with quotes and brackets inside quoted string values
model_output = (
'{"name": "searchTool", '
'"parameters": {'
'"query": "test {value} [complex]",'
'"nested": {"inner": "more {brackets}"}'
"}}"
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 1
assert result.tool_calls[0].function.name == "searchTool"
# Verify the string values are preserved including brackets and quotes
import json
args = json.loads(result.tool_calls[0].function.arguments)
assert args["query"] == "test {value} [complex]"
assert args["nested"]["inner"] == "more {brackets}"
def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
# Test with escaped quotes in deeply nested JSON
model_output = (
'{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
)
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is True
assert len(result.tool_calls) == 1
assert result.tool_calls[0].function.name == "parserTool"
# Verify escaped quotes are preserved
import json
args = json.loads(result.tool_calls[0].function.arguments)
assert args["text"] == 'He said "Hello {world}"'
def test_extract_tool_calls_missing_name_key(parser):
# Test that missing "name" key returns content
model_output = '{"parameters": {}}'
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is False
assert len(result.tool_calls) == 0
assert result.content == model_output
def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
# Test that missing both "parameters" and "arguments" keys returns content
model_output = '{"name": "toolWithoutParams"}'
result = parser.extract_tool_calls(model_output, None)
assert result.tools_called is False
assert len(result.tool_calls) == 0
assert result.content == model_output
def test_regex_timeout_handling(parser):
"""Test regex timeout is handled gracefully"""
fake_problematic_input = "{hello world[A(A=" + "\t)A(A=,\t" * 2
# create a mock regex that raises TimeoutError
mock_regex = MagicMock()
mock_regex.finditer.side_effect = TimeoutError("Regex timeout")
with patch.object(parser, "tool_call_start_regex", mock_regex):
result = parser.extract_tool_calls(fake_problematic_input, None)
# should treat as regular text when regex times out
assert result.content == fake_problematic_input
assert result.tools_called is False
assert len(result.tool_calls) == 0
mock_regex.finditer.assert_called_once()

View File

@ -7,6 +7,12 @@ import pytest
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"

View File

@ -7,6 +7,12 @@ import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
max_model_len = 128

View File

@ -11,6 +11,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
run_mteb_embed_task,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

View File

@ -9,6 +9,12 @@ import torch.nn.functional as F
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"

View File

@ -19,6 +19,7 @@ from vllm.entrypoints.openai.protocol import (
EmbeddingResponse,
PoolingResponse,
)
from vllm.platforms import current_platform
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils.serial_utils import (
EMBED_DTYPE_TO_TORCH_DTYPE,
@ -28,6 +29,11 @@ from vllm.utils.serial_utils import (
decode_pooling_output,
)
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
DTYPE = "bfloat16"

View File

@ -12,6 +12,12 @@ from tests.models.language.pooling.embed_utils import run_embedding_correctness_
from tests.models.utils import EmbedModelInfo
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),

View File

@ -16,6 +16,12 @@ import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
def _generate_random_text(word_count: int) -> str:

View File

@ -13,6 +13,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
run_mteb_rerank,
)
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

View File

@ -9,6 +9,12 @@ import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"

View File

@ -8,6 +8,12 @@ import torch.nn.functional as F
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"

View File

@ -10,6 +10,12 @@ from torch import tensor
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import ScoreResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},

View File

@ -1,7 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
from openai.types.responses.response_reasoning_item import (
Content,
ResponseReasoningItem,
Summary,
)
from vllm.entrypoints.responses_utils import (
construct_chat_message_with_tool_call,
convert_tool_responses_to_completions_format,
)
@ -28,3 +39,65 @@ class TestResponsesUtils:
result = convert_tool_responses_to_completions_format(input_tool)
assert result == {"type": "function", "function": input_tool}
def test_construct_chat_message_with_tool_call(self):
item = ResponseReasoningItem(
id="lol",
summary=[],
type="reasoning",
content=[
Content(
text="Leroy Jenkins",
type="reasoning_text",
)
],
encrypted_content=None,
status=None,
)
formatted_item = construct_chat_message_with_tool_call(item)
assert formatted_item["role"] == "assistant"
assert formatted_item["reasoning"] == "Leroy Jenkins"
item = ResponseReasoningItem(
id="lol",
summary=[
Summary(
text='Hmm, the user has just started with a simple "Hello,"',
type="summary_text",
)
],
type="reasoning",
content=None,
encrypted_content=None,
status=None,
)
formatted_item = construct_chat_message_with_tool_call(item)
assert formatted_item["role"] == "assistant"
assert (
formatted_item["reasoning"]
== 'Hmm, the user has just started with a simple "Hello,"'
)
tool_call_output = ResponseFunctionToolCallOutputItem(
id="temp_id",
type="function_call_output",
call_id="temp",
output="1234",
status="completed",
)
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
assert formatted_item["role"] == "tool"
assert formatted_item["content"] == "1234"
assert formatted_item["tool_call_id"] == "temp"
item = ResponseReasoningItem(
id="lol",
summary=[],
type="reasoning",
content=None,
encrypted_content="TOP_SECRET_MESSAGE",
status=None,
)
with pytest.raises(ValueError):
construct_chat_message_with_tool_call(item)

View File

@ -13,12 +13,6 @@ from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
if not current_platform.is_rocm():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from tests.kernels.utils import make_alibi_bias
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
@ -448,129 +442,6 @@ def ref_multi_query_kv_attention(
return torch.cat(ref_outputs, dim=0)
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
)
@torch.inference_mode()
def test_multi_query_kv_attention(
num_seqs: int,
num_heads: tuple[int, int],
head_size: int,
dtype: torch.dtype,
seed: int,
device: str,
use_alibi: bool = False,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use
# a smaller MAX_SEQ_LEN here.
max_len = min(MAX_SEQ_LEN, 4096)
seq_lens = random.sample(range(1, max_len), num_seqs)
num_tokens = sum(seq_lens)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
qkv = torch.empty(
num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype
)
qkv.uniform_(-scale, scale)
query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1)
num_queries_per_kv = num_query_heads // num_kv_heads
if num_queries_per_kv > 1:
# Handle MQA and GQA
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
alibi_bias = None
if use_alibi:
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
output = torch.empty_like(query)
start = 0
# Dynamic sequence length not supported with custom attn_bias.
for i, seq_len in enumerate(seq_lens):
end = start + seq_len
out = xops.memory_efficient_attention_forward(
query[None, start:end],
key[None, start:end],
value[None, start:end],
attn_bias=attn_bias[i],
p=0.0,
scale=scale,
)
output[start:end].copy_(out.view_as(query[start:end]))
start += seq_len
# xformers.AttentionBias to Tensor for use in reference impl.
alibi_bias = [
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
for b, i in zip(attn_bias, seq_lens)
]
else:
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
output = xops.memory_efficient_attention_forward(
query.unsqueeze(0),
key.unsqueeze(0),
value.unsqueeze(0),
attn_bias=attn_bias,
p=0.0,
scale=scale,
)
output = output.squeeze(0)
cu_seq_lens = [0]
for seq_len in seq_lens:
cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
ref_output = ref_multi_query_kv_attention(
cu_seq_lens,
query,
key,
value,
scale,
alibi_bias,
dtype,
)
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", [64])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
)
@torch.inference_mode()
def test_multi_query_kv_attention_with_alibi(
num_seqs: int,
num_heads: tuple[int, int],
head_size: int,
dtype: torch.dtype,
seed: int,
device: str,
) -> None:
return test_multi_query_kv_attention(
num_seqs,
num_heads,
head_size,
dtype,
seed,
device,
use_alibi=True,
)
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64

View File

@ -34,7 +34,7 @@ DEVICE_MLA_BACKENDS = {
}
DEVICE_REGULAR_ATTN_BACKENDS = {
"cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
"cuda": ["FLASHINFER", "FLASH_ATTN"],
"hip": ["ROCM_ATTN"],
"cpu": ["CPU_ATTN"],
}
@ -207,12 +207,6 @@ def test_env(
)
expected = "FLASHINFER"
assert backend.get_name() == expected
elif name == "XFORMERS":
backend = get_attn_backend(
32, torch.float16, None, block_size, use_mla=use_mla
)
expected = "XFORMERS"
assert backend.get_name() == expected
elif name == "FLASH_ATTN":
backend = get_attn_backend(
32, torch.float16, None, block_size, use_mla=use_mla

View File

@ -921,12 +921,16 @@ def test_gather_and_maybe_dequant_cache_mla(
)
_fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)
seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device)
seq_len_tensor = torch.randint(
max_seq_len, max_seq_len + 1, (batch_size,), device=device
)
total_tokens = seq_len_tensor.sum()
cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device)
cu_seq_lens[0] = 0
cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32)
token_to_seq = torch.arange(0, batch_size, dtype=torch.int32, device=device)
token_to_seq = torch.repeat_interleave(token_to_seq, seq_len_tensor)
print("seq_len_tensor", seq_len_tensor)
tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size
@ -977,7 +981,8 @@ def test_gather_and_maybe_dequant_cache_mla(
dst,
block_table,
cu_seq_lens,
batch_size,
token_to_seq,
total_tokens,
kv_cache_dtype,
scale,
None,
@ -990,7 +995,8 @@ def test_gather_and_maybe_dequant_cache_mla(
dst,
block_table,
cu_seq_lens,
batch_size,
token_to_seq,
total_tokens,
kv_cache_dtype,
scale,
None,

View File

@ -24,10 +24,6 @@ from vllm.platforms.rocm import RocmPlatform
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()
# Clear xformers availability cache
import vllm.attention.layer as layer_module
layer_module.USE_XFORMERS_OPS = None
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])

View File

@ -39,6 +39,11 @@ MNK_FACTORS = [
NUM_EXPERTS = [8, 64]
TOP_KS = [1, 2, 6]
DTYPES = [torch.bfloat16]
if not current_platform.is_fp8_fnuz():
DTYPES.append(torch.float8_e4m3fn)
vllm_config = VllmConfig()
@ -96,7 +101,7 @@ class BatchedMMTensors:
@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512])
@pytest.mark.parametrize("K", [128, 1024])
@pytest.mark.parametrize("N", [128, 1024])
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
def test_batched_mm(
@ -229,7 +234,7 @@ def test_batched_mm(
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("input_scales", [False])

View File

@ -31,6 +31,11 @@ dg_available = has_deep_gemm()
if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
if current_platform.is_fp8_fnuz():
pytest.skip(
"Tests in this file require float8_e4m3fn and platform does not support",
allow_module_level=True,
)
vllm_config = VllmConfig()

View File

@ -11,7 +11,6 @@ from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
apply_flashinfer_per_tensor_scale_fp8,
flashinfer_cutlass_moe_fp8,
@ -151,14 +150,11 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
router_logits=score,
use_grouped_topk=False,
top_k=topk,
gating_output=score,
topk=topk,
renormalize=False,
custom_routing_function=Llama4MoE.custom_routing_function,
scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(
@ -219,14 +215,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
topk_weights, topk_ids, _ = FusedMoE.select_experts(
topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
router_logits=score,
use_grouped_topk=False,
top_k=topk,
gating_output=score,
topk=topk,
renormalize=False,
custom_routing_function=Llama4MoE.custom_routing_function,
scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(

View File

@ -270,6 +270,11 @@ class Case:
@pytest.mark.parametrize("num_token", [2])
@pytest.mark.parametrize("tp", [1, 2, 4, 8])
def test_equiv(num_token, a_dtype, w_dtype, tp):
from triton_kernels.tensor_details import layout
if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"):
pytest.skip("make_default_matmul_mxfp4_w_layout not available")
M = num_token
E = ModelConfig.num_experts
K = ModelConfig.hidden_size

View File

@ -46,6 +46,12 @@ meets_multi_gpu_requirements = pytest.mark.skipif(
reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages",
)
if current_platform.is_fp8_fnuz():
pytest.skip(
"Tests in this file require float8_e4m3fn and platform does not support",
allow_module_level=True,
)
def format_result(verbose, msg, ex=None):
if ex is not None:

View File

@ -23,6 +23,12 @@ TOP_KS = [2, 6, 8]
EP_SIZE = [1, 4, 16]
current_platform.seed_everything(0)
if current_platform.is_rocm():
pytest.skip(
"moe_permute_unpermute_supported is not defined for ROCm",
allow_module_level=True,
)
def torch_permute(
hidden_states: torch.Tensor,

Some files were not shown because too many files have changed in this diff Show More