mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-02 10:27:12 +08:00
Merge branch 'main' into rename_file_info_to_pkg/file
This commit is contained in:
commit
f851177b97
@ -7,53 +7,51 @@ set -ex
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-0-16}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
|
||||
NUMA_NODE=${NUMA_NODE:-0}
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=16
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" || true;
|
||||
docker rm -f cpu-test || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
# Run the image
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/kernels/test_onednn.py
|
||||
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
|
||||
|
||||
# basic online serving
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c '
|
||||
docker exec cpu-test bash -c '
|
||||
set -e
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
|
||||
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
|
||||
server_pid=$!
|
||||
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
|
||||
vllm bench serve \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--model Qwen/Qwen3-0.6B \
|
||||
--num-prompts 20 \
|
||||
--endpoint /v1/completions
|
||||
kill -s SIGTERM $server_pid &'
|
||||
@ -61,4 +59,4 @@ function cpu_tests() {
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||
timeout 2h bash -c cpu_tests
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
|
||||
THRESHOLD=${1:-0.8}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8020}
|
||||
DATA_PARALLEL_SIZE=${4:-2}
|
||||
TENSOR_PARALLEL_SIZE=${5:-2}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
@ -45,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
--data-parallel-size 2 \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
|
||||
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
|
||||
--data-parallel-size ${DATA_PARALLEL_SIZE} \
|
||||
--enable-expert-parallel \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
@ -1486,4 +1486,4 @@ steps:
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
@ -192,6 +192,7 @@ steps:
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||
@ -631,6 +632,7 @@ steps:
|
||||
# we can only upgrade after this is resolved
|
||||
# TODO(jerryzh168): resolve the above comment
|
||||
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||
- uv pip install --system conch-triton-kernels
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -902,11 +904,12 @@ steps:
|
||||
- label: Transformers Nightly Models Test
|
||||
working_dir: "/vllm-workspace/"
|
||||
optional: true
|
||||
soft_fail: true
|
||||
commands:
|
||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
|
||||
- pytest -v -s tests/models/test_initialization.py
|
||||
- pytest -v -s tests/models/test_transformers.py
|
||||
# - pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/processing/
|
||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
||||
- python3 examples/offline_inference/basic/chat.py
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
@ -1116,6 +1119,7 @@ steps:
|
||||
# https://github.com/NVIDIA/nccl/issues/1838
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
@ -1340,11 +1344,20 @@ steps:
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
|
||||
|
||||
- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
|
||||
timeout_in_minutes: 60
|
||||
gpu: b200
|
||||
optional: true
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
2
.github/workflows/cleanup_pr_body.yml
vendored
2
.github/workflows/cleanup_pr_body.yml
vendored
@ -13,7 +13,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
|
||||
25
.github/workflows/issue_autolabel.yml
vendored
25
.github/workflows/issue_autolabel.yml
vendored
@ -105,6 +105,31 @@ jobs:
|
||||
}
|
||||
],
|
||||
},
|
||||
cpu: {
|
||||
// Keyword search - matches whole words only (with word boundaries)
|
||||
keywords: [
|
||||
{
|
||||
term: "CPU Backend",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "x86",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "ARM",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "Apple Silicon",
|
||||
searchIn: "title"
|
||||
},
|
||||
{
|
||||
term: "IBM Z",
|
||||
searchIn: "title"
|
||||
},
|
||||
],
|
||||
},
|
||||
// Add more label configurations here as needed
|
||||
// example: {
|
||||
// keywords: [...],
|
||||
|
||||
2
.github/workflows/macos-smoke-test.yml
vendored
2
.github/workflows/macos-smoke-test.yml
vendored
@ -12,7 +12,7 @@ jobs:
|
||||
timeout-minutes: 30
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- uses: astral-sh/setup-uv@v7
|
||||
with:
|
||||
|
||||
2
.github/workflows/pre-commit.yml
vendored
2
.github/workflows/pre-commit.yml
vendored
@ -16,7 +16,7 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
|
||||
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
@ -136,7 +136,7 @@ elseif(HIP_FOUND)
|
||||
|
||||
# ROCm 5.X and 6.X
|
||||
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
|
||||
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
|
||||
"expected for ROCm build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
@ -604,12 +604,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
set(SRCS
|
||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
|
||||
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
||||
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
|
||||
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${FP4_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
|
||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||
|
||||
@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
|
||||
|
||||
*Latest News* 🔥
|
||||
|
||||
- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
|
||||
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
|
||||
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
|
||||
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
|
||||
|
||||
Currently this just includes dense GEMMs and only works on Hopper GPUs.
|
||||
Currently, this just includes dense GEMMs and only works on Hopper GPUs.
|
||||
|
||||
## Setup
|
||||
|
||||
|
||||
@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
|
||||
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
|
||||
const float* prefix_lse, const scalar_t* suffix_output,
|
||||
const float* suffix_lse, const uint num_tokens, const uint num_heads,
|
||||
const uint head_size) {
|
||||
const uint head_size, const uint prefix_head_stride,
|
||||
const uint output_head_stride) {
|
||||
using pack_128b_t = uint4;
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
const uint threads_per_head = head_size / pack_size;
|
||||
@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
|
||||
const uint head_idx = token_head_idx % num_heads;
|
||||
|
||||
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
|
||||
const uint head_offset =
|
||||
token_idx * num_heads * head_size + head_idx * head_size;
|
||||
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
|
||||
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
|
||||
scalar_t* output_head_ptr = output + head_offset;
|
||||
const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
|
||||
head_idx * prefix_head_stride;
|
||||
const uint dst_head_offset = token_idx * num_heads * output_head_stride +
|
||||
head_idx * output_head_stride;
|
||||
const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
|
||||
const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
|
||||
scalar_t* output_head_ptr = output + dst_head_offset;
|
||||
|
||||
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
|
||||
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
|
||||
@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
|
||||
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
|
||||
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
|
||||
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
|
||||
num_heads, head_size); \
|
||||
num_heads, head_size, prefix_head_stride, output_head_stride); \
|
||||
}
|
||||
|
||||
/*@brief Merges the attention states from prefix and suffix
|
||||
@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
|
||||
const uint num_tokens = output.size(0);
|
||||
const uint num_heads = output.size(1);
|
||||
const uint head_size = output.size(2);
|
||||
const uint prefix_head_stride = prefix_output.stride(1);
|
||||
const uint output_head_stride = output.stride(1);
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
TORCH_CHECK(head_size % pack_size == 0,
|
||||
"headsize must be multiple of pack_size:", pack_size);
|
||||
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
|
||||
"output heads must be contiguous in memory");
|
||||
TORCH_CHECK(
|
||||
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
|
||||
"prefix_output heads must be contiguous in memory");
|
||||
TORCH_CHECK(
|
||||
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
|
||||
"suffix_output heads must be contiguous in memory");
|
||||
float* output_lse_ptr = nullptr;
|
||||
if (output_lse.has_value()) {
|
||||
output_lse_ptr = output_lse.value().data_ptr<float>();
|
||||
|
||||
11
csrc/cache.h
11
csrc/cache.h
@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
|
||||
void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
|
||||
int64_t num_tokens, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
|
||||
@ -905,91 +905,79 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
namespace vllm {
|
||||
|
||||
// grid is launched with dimensions (batch, num_splits)
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt,
|
||||
int ENTRY_SIZE, int CTA_SIZE>
|
||||
__global__ void gather_and_maybe_dequant_cache(
|
||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t block_size, const int32_t entry_size,
|
||||
const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
|
||||
// ENTRIES...]
|
||||
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
|
||||
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
|
||||
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
|
||||
const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
|
||||
const int32_t num_tokens, const int32_t block_size,
|
||||
const int64_t block_table_stride, const int64_t cache_block_stride,
|
||||
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
|
||||
const float* __restrict__ scale,
|
||||
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
|
||||
// batch
|
||||
constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
|
||||
using ltype = vllm::vec_n_t<cache_t, vec_size>;
|
||||
using stype = vllm::vec_n_t<scalar_t, vec_size>;
|
||||
// We are adding this for code readability which will be optimized out when
|
||||
// build in release.
|
||||
assert(CTA_SIZE == blockDim.x);
|
||||
|
||||
const int64_t bid = blockIdx.x; // Batch ID
|
||||
const int32_t num_splits = gridDim.y;
|
||||
const int32_t split = blockIdx.y;
|
||||
const int32_t seq_start = cu_seq_lens[bid];
|
||||
const int32_t seq_end = cu_seq_lens[bid + 1];
|
||||
const int32_t seq_len = seq_end - seq_start;
|
||||
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
|
||||
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
|
||||
#pragma unroll
|
||||
for (int token_id = blockIdx.x; token_id < num_tokens;
|
||||
token_id += gridDim.x) {
|
||||
int64_t batch_id = token_to_seq[token_id];
|
||||
int64_t batch_start = cu_seq_lens[batch_id];
|
||||
int64_t batch_end = cu_seq_lens[batch_id + 1];
|
||||
int32_t batch_offset = token_id - batch_start;
|
||||
|
||||
const int32_t split_start = split * split_blocks;
|
||||
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
|
||||
if (token_id >= batch_end) return;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[batch_id];
|
||||
}
|
||||
batch_offset += offset;
|
||||
int32_t block_table_id = batch_offset / block_size;
|
||||
int32_t slot_id = batch_offset % block_size;
|
||||
int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
|
||||
int32_t block_id = block_table[block_table_offset];
|
||||
int64_t cache_offset =
|
||||
block_id * cache_block_stride + slot_id * cache_entry_stride;
|
||||
constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
|
||||
scalar_t* dst_ = dst + token_id * dst_entry_stride;
|
||||
cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
|
||||
|
||||
const bool is_active_split = (split_start < tot_blocks);
|
||||
const bool is_last_split = (split_end == tot_blocks);
|
||||
|
||||
if (!is_active_split) return;
|
||||
|
||||
int32_t full_blocks_end = split_end;
|
||||
int32_t partial_block_size = 0;
|
||||
|
||||
// Adjust the pointer for the block_table for this batch.
|
||||
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
|
||||
// page_size)
|
||||
const int32_t batch_offset = bid * block_table_stride;
|
||||
int32_t offset = 0;
|
||||
if (seq_starts != nullptr) {
|
||||
offset = seq_starts[bid] / block_size;
|
||||
}
|
||||
const int32_t* batch_block_table = block_table + batch_offset + offset;
|
||||
|
||||
// Adjust dst pointer based on the cumulative sequence lengths.
|
||||
dst += seq_start * dst_entry_stride;
|
||||
|
||||
if (is_last_split) {
|
||||
partial_block_size = seq_len % block_size;
|
||||
if (partial_block_size) full_blocks_end -= 1;
|
||||
}
|
||||
|
||||
auto copy_entry = [&](const cache_t* __restrict__ _src,
|
||||
scalar_t* __restrict__ _dst) {
|
||||
for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
_dst[i] = static_cast<scalar_t>(_src[i]);
|
||||
reinterpret_cast<stype*>(dst_)[idx] =
|
||||
static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
|
||||
} else {
|
||||
_dst[i] =
|
||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
|
||||
ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
|
||||
stype store_val;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < vec_size; ++j) {
|
||||
store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
|
||||
loaded_val.val[j], *scale);
|
||||
}
|
||||
reinterpret_cast<stype*>(dst_)[idx] = store_val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const auto loop_end =
|
||||
std::min((int64_t)full_blocks_end, block_table_stride - offset);
|
||||
for (int pid = split_start; pid < loop_end; ++pid) {
|
||||
auto block_id = batch_block_table[pid];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
}
|
||||
}
|
||||
|
||||
if (partial_block_size) {
|
||||
if (offset + full_blocks_end < block_table_stride) {
|
||||
auto block_id = batch_block_table[full_blocks_end];
|
||||
auto block_start_ptr = src_cache + block_id * cache_block_stride;
|
||||
auto block_dst_ptr =
|
||||
dst + full_blocks_end * block_size * dst_entry_stride;
|
||||
for (int eid = 0; eid < partial_block_size; ++eid) {
|
||||
copy_entry(block_start_ptr + eid * cache_entry_stride,
|
||||
block_dst_ptr + eid * dst_entry_stride);
|
||||
// process tail
|
||||
constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
|
||||
dst_ = dst_ + ENTRY_SIZE - tail_cnt;
|
||||
src_ = src_ + ENTRY_SIZE - tail_cnt;
|
||||
#pragma unroll
|
||||
for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
dst_[idx] = static_cast<scalar_t>(src_[idx]);
|
||||
} else {
|
||||
dst_[idx] =
|
||||
fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1001,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
|
||||
// SCALAR_T is the data type of the destination tensor.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
block_size, entry_size, block_table_stride, cache_block_stride, \
|
||||
cache_entry_stride, dst_entry_stride, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
|
||||
#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE, 576, \
|
||||
thread_block_size> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CACHE_T*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<SCALAR_T*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
token_to_seq.data_ptr<int32_t>(), num_tokens, block_size, \
|
||||
block_table_stride, cache_block_stride, cache_entry_stride, \
|
||||
dst_entry_stride, reinterpret_cast<const float*>(scale.data_ptr()), \
|
||||
seq_starts_ptr);
|
||||
|
||||
// Gather sequences from the cache into the destination tensor.
|
||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||
// - block_table contains the cache block indices for each sequence
|
||||
// - token_to_seq contains the back mapping from token_id to batch_id
|
||||
// - Optionally, seq_starts (if provided) offsets the starting block index by
|
||||
// (seq_starts[bid] / page_size)
|
||||
void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
|
||||
int64_t num_tokens, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt) {
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int32_t block_size = src_cache.size(1);
|
||||
int32_t entry_size = src_cache.flatten(2, -1).size(2);
|
||||
int32_t head_dim = dst.size(-1);
|
||||
|
||||
TORCH_CHECK(block_table.dtype() == torch::kInt32,
|
||||
"block_table must be int32");
|
||||
@ -1038,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
|
||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||
"seq_starts must be int32");
|
||||
}
|
||||
TORCH_CHECK(head_dim == 576,
|
||||
"gather_and_maybe_dequant_cache only support the head_dim to 576 "
|
||||
"for better performance")
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
@ -1055,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
|
||||
int64_t cache_entry_stride = src_cache.stride(1);
|
||||
int64_t dst_entry_stride = dst.stride(0);
|
||||
|
||||
// Decide on the number of splits based on the batch size.
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(1024);
|
||||
constexpr int32_t thread_block_size = 64;
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(thread_block_size);
|
||||
|
||||
const int32_t* seq_starts_ptr =
|
||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||
|
||||
@ -847,7 +847,7 @@ struct VecTypeTrait<c10::BFloat16> {
|
||||
};
|
||||
#endif
|
||||
|
||||
#if !defined(__powerpc__)
|
||||
#if !defined(__powerpc__) && !defined(__s390x__)
|
||||
template <>
|
||||
struct VecTypeTrait<c10::Half> {
|
||||
using vec_t = vec_op::FP16Vec16;
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
#include <vecintrin.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <torch/all.h>
|
||||
namespace vec_op {
|
||||
|
||||
@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
}
|
||||
|
||||
explicit FP32Vec8(const BF16Vec8& v) {
|
||||
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
|
||||
reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
|
||||
// On big-endian s390x, place BF16 first to get correct byte order
|
||||
reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
|
||||
reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
|
||||
}
|
||||
|
||||
float reduce_sum() const {
|
||||
@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
|
||||
}
|
||||
|
||||
FP32Vec8 exp() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::exp(ar.values[0]);
|
||||
ret.val[0][1] = std::exp(ar.values[1]);
|
||||
ret.val[0][2] = std::exp(ar.values[2]);
|
||||
ret.val[0][3] = std::exp(ar.values[3]);
|
||||
ret.val[1][0] = std::exp(ar.values[4]);
|
||||
ret.val[1][1] = std::exp(ar.values[5]);
|
||||
ret.val[1][2] = std::exp(ar.values[6]);
|
||||
ret.val[1][3] = std::exp(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
f32x4x2_t out;
|
||||
|
||||
const __vector float log2e = vec_splats(1.44269504088896341f);
|
||||
const __vector float one = vec_splats(1.0f);
|
||||
const __vector float min_x = vec_splats(-87.3f);
|
||||
const __vector float max_x = vec_splats(88.7f);
|
||||
|
||||
// 5th-degree minimax polynomial for 2^r (r in [0,1))
|
||||
const __vector float c1 = vec_splats(0.6931471805599453f);
|
||||
const __vector float c2 = vec_splats(0.240226506959101f);
|
||||
const __vector float c3 = vec_splats(0.05550410866482158f);
|
||||
const __vector float c4 = vec_splats(0.009618129107628477f);
|
||||
const __vector float c5 = vec_splats(0.0013333558146428443f);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
__vector float x = reg.val[i];
|
||||
|
||||
x = vec_max(x, min_x);
|
||||
x = vec_min(x, max_x);
|
||||
|
||||
__vector float y = vec_mul(x, log2e);
|
||||
|
||||
__vector float kf = vec_floor(y);
|
||||
__vector float r = vec_sub(y, kf);
|
||||
|
||||
__vector signed int k = vec_signed(kf);
|
||||
const __vector signed int min_k = vec_splats((signed int)-126);
|
||||
const __vector signed int max_k = vec_splats((signed int)127);
|
||||
k = vec_min(vec_max(k, min_k), max_k);
|
||||
|
||||
// Build 2^k from exponent bits
|
||||
__vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
|
||||
__vector unsigned int bits = (__vector unsigned int)exp_int;
|
||||
bits = vec_sl(bits, vec_splats((unsigned int)23));
|
||||
__vector float pow2k = (__vector float)bits;
|
||||
|
||||
// Improved minimax polynomial
|
||||
__vector float poly = vec_madd(c5, r, c4);
|
||||
poly = vec_madd(poly, r, c3);
|
||||
poly = vec_madd(poly, r, c2);
|
||||
poly = vec_madd(poly, r, c1);
|
||||
poly = vec_madd(poly, r, one);
|
||||
|
||||
out.val[i] = vec_mul(pow2k, poly);
|
||||
}
|
||||
|
||||
return FP32Vec8(out);
|
||||
}
|
||||
|
||||
FP32Vec8 tanh() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::tanh(ar.values[0]);
|
||||
ret.val[0][1] = std::tanh(ar.values[1]);
|
||||
ret.val[0][2] = std::tanh(ar.values[2]);
|
||||
ret.val[0][3] = std::tanh(ar.values[3]);
|
||||
ret.val[1][0] = std::tanh(ar.values[4]);
|
||||
ret.val[1][1] = std::tanh(ar.values[5]);
|
||||
ret.val[1][2] = std::tanh(ar.values[6]);
|
||||
ret.val[1][3] = std::tanh(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
// tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
|
||||
const __vector float one = vec_splats(1.0f);
|
||||
const __vector float two = vec_splats(2.0f);
|
||||
const __vector float zero = vec_splats(0.0f);
|
||||
const __vector float sat =
|
||||
vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
|
||||
|
||||
f32x4x2_t out;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
__vector float x = reg.val[i];
|
||||
__vector float ax = vec_abs(x);
|
||||
|
||||
// sign(x): +1 or -1
|
||||
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
|
||||
|
||||
// saturation mask: |x| > sat
|
||||
__vector __bool int saturated = vec_cmpgt(ax, sat);
|
||||
|
||||
// 2x
|
||||
__vector float two_x = vec_mul(x, two);
|
||||
|
||||
// Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
|
||||
f32x4x2_t tmp;
|
||||
tmp.val[0] = two_x;
|
||||
tmp.val[1] = two_x;
|
||||
FP32Vec8 exp_2x_vec(tmp);
|
||||
|
||||
FP32Vec8 e2x = exp_2x_vec.exp();
|
||||
__vector float e = e2x.reg.val[i];
|
||||
|
||||
// tanh(x) = (e - 1) / (e + 1)
|
||||
__vector float num = vec_sub(e, one);
|
||||
__vector float den = vec_add(e, one);
|
||||
|
||||
__vector float t = vec_div(num, den);
|
||||
|
||||
// For large |x|, clamp to sign(x)
|
||||
out.val[i] = vec_sel(t, sign, saturated);
|
||||
}
|
||||
|
||||
return FP32Vec8(out);
|
||||
}
|
||||
|
||||
FP32Vec8 er() const {
|
||||
// TODO: Vectorize this
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
f32x4x4_t ret;
|
||||
ret.val[0][0] = std::erf(ar.values[0]);
|
||||
ret.val[0][1] = std::erf(ar.values[1]);
|
||||
ret.val[0][2] = std::erf(ar.values[2]);
|
||||
ret.val[0][3] = std::erf(ar.values[3]);
|
||||
ret.val[1][0] = std::erf(ar.values[4]);
|
||||
ret.val[1][1] = std::erf(ar.values[5]);
|
||||
ret.val[1][2] = std::erf(ar.values[6]);
|
||||
ret.val[1][3] = std::erf(ar.values[7]);
|
||||
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
|
||||
// A&S 7.1.26 approximation:
|
||||
// erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
|
||||
// exp(-x^2)) t = 1 / (1 + p*|x|), p = 0.3275911
|
||||
|
||||
const __vector float one = vec_splats(1.0f);
|
||||
const __vector float zero = vec_splats(0.0f);
|
||||
const __vector float p = vec_splats(0.3275911f);
|
||||
|
||||
// Polynomial coeffs
|
||||
const __vector float a1 = vec_splats(0.254829592f);
|
||||
const __vector float a2 = vec_splats(-0.284496736f);
|
||||
const __vector float a3 = vec_splats(1.421413741f);
|
||||
const __vector float a4 = vec_splats(-1.453152027f);
|
||||
const __vector float a5 = vec_splats(1.061405429f);
|
||||
|
||||
// Threshold where erf(x) ~ sign(x)
|
||||
const __vector float sat = vec_splats(6.0f);
|
||||
|
||||
f32x4x2_t out;
|
||||
|
||||
for (int lane = 0; lane < 2; lane++) {
|
||||
__vector float x = reg.val[lane];
|
||||
__vector float ax = vec_abs(x);
|
||||
|
||||
// sign(x)
|
||||
__vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
|
||||
|
||||
// |x| > 6 → erf(x) = ±1
|
||||
__vector __bool int saturated = vec_cmpgt(ax, sat);
|
||||
|
||||
// t = 1 / (1 + p * |x|)
|
||||
__vector float t = vec_madd(p, ax, one);
|
||||
t = vec_div(one, t);
|
||||
|
||||
// poly = a5
|
||||
__vector float poly = a5;
|
||||
poly = vec_madd(poly, t, a4);
|
||||
poly = vec_madd(poly, t, a3);
|
||||
poly = vec_madd(poly, t, a2);
|
||||
poly = vec_madd(poly, t, a1);
|
||||
|
||||
// full polynomial: poly = poly * t
|
||||
poly = vec_mul(poly, t);
|
||||
|
||||
// Compute exp(-x^2)
|
||||
__vector float x2 = vec_mul(x, x);
|
||||
__vector float neg_x2 = vec_neg(x2);
|
||||
|
||||
f32x4x2_t tmp;
|
||||
tmp.val[0] = neg_x2;
|
||||
tmp.val[1] = neg_x2;
|
||||
FP32Vec8 exp_neg_x2(tmp);
|
||||
|
||||
FP32Vec8 e = exp_neg_x2.exp();
|
||||
__vector float ex = e.reg.val[lane];
|
||||
|
||||
// erf(x) = sign * (1 - poly * exp(-x^2))
|
||||
__vector float term = vec_mul(poly, ex);
|
||||
__vector float y = vec_sub(one, term);
|
||||
y = vec_mul(y, sign);
|
||||
|
||||
// saturated → ±1
|
||||
__vector float sat_val = vec_mul(sign, one);
|
||||
out.val[lane] = vec_sel(y, sat_val, saturated);
|
||||
}
|
||||
|
||||
return FP32Vec8(out);
|
||||
}
|
||||
// Elementwise sigmoid(x) = 1 / (1 + exp(-x))
|
||||
FP32Vec8 sigmoid() const {
|
||||
const __vector float one = vec_splats(1.0f);
|
||||
|
||||
f32x4x2_t neg;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
neg.val[i] = vec_neg(reg.val[i]);
|
||||
}
|
||||
|
||||
FP32Vec8 neg_x(neg);
|
||||
FP32Vec8 e = neg_x.exp(); // exp(-x)
|
||||
|
||||
f32x4x2_t denom;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
denom.val[i] = vec_add(one, e.reg.val[i]);
|
||||
}
|
||||
|
||||
FP32Vec8 denom_vec(denom);
|
||||
FP32Vec8 one_vec(1.0f);
|
||||
|
||||
return one_vec / denom_vec;
|
||||
}
|
||||
|
||||
// Tanh-based GELU:
|
||||
// gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
|
||||
FP32Vec8 gelu_tanh() const {
|
||||
const __vector float k_s2pi = vec_splats(0.7978845608028654f); // √(2/π)
|
||||
const __vector float k_0_0447 = vec_splats(0.044715f);
|
||||
|
||||
f32x4x2_t x2, x3, inner;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
__vector float x = reg.val[i];
|
||||
x2.val[i] = vec_mul(x, x); // x^2
|
||||
x3.val[i] = vec_mul(x2.val[i], x); // x^3
|
||||
__vector float t = vec_madd(k_0_0447, x3.val[i], x); // x + 0.044715*x^3
|
||||
inner.val[i] = vec_mul(k_s2pi, t); // √(2/π)*(...)
|
||||
}
|
||||
|
||||
FP32Vec8 inner_vec(inner);
|
||||
FP32Vec8 t = inner_vec.tanh(); // tanh part
|
||||
|
||||
FP32Vec8 one_vec(1.0f);
|
||||
FP32Vec8 half_vec(0.5f);
|
||||
|
||||
FP32Vec8 x_vec(*this);
|
||||
return x_vec * half_vec * (one_vec + t);
|
||||
}
|
||||
|
||||
// Erf-based GELU:
|
||||
// gelu(x) = 0.5 * x * (1 + erf(x / √2))
|
||||
FP32Vec8 gelu_erf() const {
|
||||
const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f); // 1/√2
|
||||
FP32Vec8 x_vec(*this);
|
||||
|
||||
f32x4x2_t scaled;
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
|
||||
}
|
||||
FP32Vec8 x_scaled(scaled);
|
||||
|
||||
FP32Vec8 erf_x = x_scaled.er();
|
||||
|
||||
FP32Vec8 one_vec(1.0f);
|
||||
FP32Vec8 half_vec(0.5f);
|
||||
|
||||
return x_vec * half_vec * (one_vec + erf_x);
|
||||
}
|
||||
|
||||
// Elementwise reciprocal: 1/x (scalar per lane, for correctness)
|
||||
FP32Vec8 rcp() const {
|
||||
AliasReg in, out;
|
||||
in.reg = reg;
|
||||
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
out.values[i] = 1.0f / in.values[i];
|
||||
}
|
||||
return FP32Vec8(out.reg);
|
||||
}
|
||||
|
||||
// Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
|
||||
FP32Vec8 rsqrt() const {
|
||||
AliasReg in, out;
|
||||
in.reg = reg;
|
||||
|
||||
for (int i = 0; i < VEC_ELEM_NUM; ++i) {
|
||||
out.values[i] = 1.0f / std::sqrt(in.values[i]);
|
||||
}
|
||||
return FP32Vec8(out.reg);
|
||||
}
|
||||
|
||||
FP32Vec8 operator*(const FP32Vec8& b) const {
|
||||
@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec16& v) {
|
||||
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
|
||||
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
|
||||
reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
|
||||
reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
|
||||
// On big-endian s390x, place BF16 first to get correct byte order
|
||||
reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
|
||||
reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
|
||||
reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
|
||||
reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
|
||||
}
|
||||
|
||||
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
|
||||
@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
return result;
|
||||
}
|
||||
|
||||
FP32Vec16 max(const FP32Vec16& b) const {
|
||||
return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
|
||||
vec_max(reg.val[1], b.reg.val[1]),
|
||||
vec_max(reg.val[2], b.reg.val[2]),
|
||||
vec_max(reg.val[3], b.reg.val[3])}));
|
||||
}
|
||||
|
||||
float reduce_max() const {
|
||||
AliasReg ar;
|
||||
ar.reg = reg;
|
||||
float result = ar.values[0];
|
||||
unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) {
|
||||
if (ar.values[i] > result) result = ar.values[i];
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
void save(float* ptr) const {
|
||||
vec_xst(reg.val[0], 0, ptr);
|
||||
vec_xst(reg.val[1], 16, ptr);
|
||||
@ -402,15 +628,14 @@ struct VecType<c10::BFloat16> {
|
||||
using vec_type = BF16Vec8;
|
||||
};
|
||||
|
||||
// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
|
||||
using FP16Vec16 = FP32Vec16;
|
||||
|
||||
template <typename T>
|
||||
void storeFP32(float v, T* ptr) {
|
||||
*ptr = v;
|
||||
}
|
||||
|
||||
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
|
||||
acc = acc + a * b;
|
||||
}
|
||||
|
||||
namespace c10 {
|
||||
struct BFloat16 {
|
||||
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
|
||||
@ -429,6 +654,79 @@ inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
|
||||
#define __VEC_CLASS_FP_NAN (1 << 6)
|
||||
#endif
|
||||
|
||||
// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
|
||||
// intrinsics
|
||||
|
||||
// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
|
||||
FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
|
||||
acc.reg = vec_madd(a.reg, b.reg, acc.reg);
|
||||
}
|
||||
|
||||
// FP32Vec8 FMA: acc = acc + (a * b)
|
||||
FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
|
||||
acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
}
|
||||
|
||||
// FP32Vec16 FMA: acc = acc + (a * b)
|
||||
FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
|
||||
acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
|
||||
acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
|
||||
}
|
||||
|
||||
// Multiply-Subtract: acc = acc - (a * b)
|
||||
FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
|
||||
acc.reg = vec_msub(a.reg, b.reg, acc.reg);
|
||||
}
|
||||
|
||||
FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
|
||||
acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
}
|
||||
|
||||
FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
|
||||
acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
|
||||
acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
|
||||
}
|
||||
|
||||
// Negative Multiply-Add: acc = -(a * b) + acc
|
||||
FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
|
||||
acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
|
||||
}
|
||||
|
||||
FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
|
||||
acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
}
|
||||
|
||||
FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
|
||||
acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
|
||||
acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
|
||||
}
|
||||
|
||||
// Negative Multiply-Subtract: acc = -(a * b) - acc
|
||||
FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
|
||||
acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
|
||||
}
|
||||
|
||||
FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
|
||||
acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
}
|
||||
|
||||
FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
|
||||
acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
|
||||
acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
|
||||
acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
|
||||
acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
|
||||
}
|
||||
|
||||
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
|
||||
18, 19, 22, 23, 26, 27, 30, 31};
|
||||
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
|
||||
@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
|
||||
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
|
||||
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
|
||||
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||
__vector unsigned int lsb0 = inp0 >> sh16;
|
||||
__vector unsigned int lsb1 = inp1 >> sh16;
|
||||
lsb0 = lsb0 & one;
|
||||
lsb1 = lsb1 & one;
|
||||
__vector unsigned int rnd0 = lsb0 + bias;
|
||||
__vector unsigned int rnd1 = lsb1 + bias;
|
||||
inp0 = inp0 + rnd0;
|
||||
inp1 = inp1 + rnd1;
|
||||
int cc;
|
||||
__vector __bool int sel0 =
|
||||
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
|
||||
__vector __bool int sel1 =
|
||||
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
|
||||
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
|
||||
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
|
||||
inp0 = vec_sel(inp0, nan, sel0);
|
||||
inp1 = vec_sel(inp1, nan, sel1);
|
||||
inp0 = inp0 >> sh16;
|
||||
inp1 = inp1 >> sh16;
|
||||
|
||||
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||
}
|
||||
|
||||
@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
|
||||
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
|
||||
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
|
||||
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
|
||||
__vector unsigned int lsb0 = inp0 >> sh16;
|
||||
__vector unsigned int lsb1 = inp1 >> sh16;
|
||||
__vector unsigned int lsb2 = inp2 >> sh16;
|
||||
__vector unsigned int lsb3 = inp3 >> sh16;
|
||||
lsb0 = lsb0 & one;
|
||||
lsb1 = lsb1 & one;
|
||||
lsb2 = lsb2 & one;
|
||||
lsb3 = lsb3 & one;
|
||||
__vector unsigned int rnd0 = lsb0 + bias;
|
||||
__vector unsigned int rnd1 = lsb1 + bias;
|
||||
__vector unsigned int rnd2 = lsb2 + bias;
|
||||
__vector unsigned int rnd3 = lsb3 + bias;
|
||||
inp0 = inp0 + rnd0;
|
||||
inp1 = inp1 + rnd1;
|
||||
inp2 = inp2 + rnd2;
|
||||
inp3 = inp3 + rnd3;
|
||||
int cc;
|
||||
__vector __bool int sel0 =
|
||||
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
|
||||
@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
|
||||
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
|
||||
__vector __bool int sel3 =
|
||||
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
|
||||
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
|
||||
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
|
||||
inp2 = vec_sel(inp2, nan, sel2) >> sh16;
|
||||
inp3 = vec_sel(inp3, nan, sel3) >> sh16;
|
||||
inp0 = vec_sel(inp0, nan, sel0);
|
||||
inp1 = vec_sel(inp1, nan, sel1);
|
||||
inp2 = vec_sel(inp2, nan, sel2);
|
||||
inp3 = vec_sel(inp3, nan, sel3);
|
||||
inp0 = inp0 >> sh16;
|
||||
inp1 = inp1 >> sh16;
|
||||
inp2 = inp2 >> sh16;
|
||||
inp3 = inp3 >> sh16;
|
||||
|
||||
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
|
||||
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
|
||||
}
|
||||
|
||||
inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
|
||||
// 1D softmax over `n` elements in `input`, writes result to `output`.
|
||||
// Uses FP32Vec8 for main body, scalar tail handling.
|
||||
// Requirement: n > 0
|
||||
FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
|
||||
if (n <= 0) return;
|
||||
|
||||
// ---------- Pass 1: find max ----------
|
||||
float max_val = -std::numeric_limits<float>::infinity();
|
||||
int i = 0;
|
||||
|
||||
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
|
||||
FP32Vec8 v(input + i);
|
||||
FP32Vec8::AliasReg ar;
|
||||
ar.reg = v.reg;
|
||||
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
|
||||
if (ar.values[j] > max_val) max_val = ar.values[j];
|
||||
}
|
||||
}
|
||||
for (; i < n; ++i) {
|
||||
if (input[i] > max_val) max_val = input[i];
|
||||
}
|
||||
|
||||
// ---------- Pass 2: compute exp(x - max) and sum ----------
|
||||
float sum = 0.0f;
|
||||
i = 0;
|
||||
|
||||
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
|
||||
float tmp[FP32Vec8::VEC_ELEM_NUM];
|
||||
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
|
||||
tmp[j] = input[i + j] - max_val;
|
||||
}
|
||||
|
||||
FP32Vec8 v(tmp);
|
||||
FP32Vec8 e = v.exp();
|
||||
|
||||
FP32Vec8::AliasReg ar;
|
||||
ar.reg = e.reg;
|
||||
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
|
||||
output[i + j] = ar.values[j];
|
||||
sum += ar.values[j];
|
||||
}
|
||||
}
|
||||
|
||||
// Tail
|
||||
for (; i < n; ++i) {
|
||||
float x = input[i] - max_val;
|
||||
float ex = std::exp(x); // scalar tail
|
||||
output[i] = ex;
|
||||
sum += ex;
|
||||
}
|
||||
|
||||
// ---------- Pass 3: normalize ----------
|
||||
float inv_sum = 1.0f / sum;
|
||||
i = 0;
|
||||
|
||||
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
|
||||
float tmp[FP32Vec8::VEC_ELEM_NUM];
|
||||
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
|
||||
tmp[j] = output[i + j] * inv_sum;
|
||||
}
|
||||
FP32Vec8 v(tmp);
|
||||
v.save(output + i);
|
||||
}
|
||||
|
||||
for (; i < n; ++i) {
|
||||
output[i] *= inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
// 1D RMSNorm kernel:
|
||||
// input: x[0..n-1]
|
||||
// weight: w[0..n-1] (gamma), may be nullptr
|
||||
// output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
|
||||
// eps: small epsilon for numerical stability
|
||||
FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
|
||||
const float* weight, int n, float eps) {
|
||||
if (n <= 0) return;
|
||||
|
||||
// ---------- Pass 1: compute sum of squares ----------
|
||||
float sum_sq = 0.0f;
|
||||
int i = 0;
|
||||
|
||||
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
|
||||
FP32Vec8 x_vec(input + i);
|
||||
|
||||
FP32Vec8 sq = x_vec * x_vec;
|
||||
|
||||
FP32Vec8::AliasReg ar;
|
||||
ar.reg = sq.reg;
|
||||
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
|
||||
sum_sq += ar.values[j];
|
||||
}
|
||||
}
|
||||
|
||||
// Tail
|
||||
for (; i < n; ++i) {
|
||||
float v = input[i];
|
||||
sum_sq += v * v;
|
||||
}
|
||||
|
||||
float mean_sq = sum_sq / static_cast<float>(n);
|
||||
float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
|
||||
|
||||
// ---------- Pass 2: scale (and apply weight if given) ----------
|
||||
const float inv_rms_f = inv_rms;
|
||||
i = 0;
|
||||
|
||||
if (weight) {
|
||||
// with gamma
|
||||
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
|
||||
FP32Vec8 x_vec(input + i);
|
||||
|
||||
float wtmp[FP32Vec8::VEC_ELEM_NUM];
|
||||
for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
|
||||
wtmp[j] = weight[i + j];
|
||||
}
|
||||
FP32Vec8 w_vec(wtmp);
|
||||
|
||||
FP32Vec8 scale_vec(inv_rms_f);
|
||||
FP32Vec8 y = x_vec * scale_vec * w_vec;
|
||||
y.save(output + i);
|
||||
}
|
||||
|
||||
for (; i < n; ++i) {
|
||||
output[i] = input[i] * inv_rms_f * weight[i];
|
||||
}
|
||||
} else {
|
||||
// without gamma
|
||||
for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
|
||||
FP32Vec8 x_vec(input + i);
|
||||
FP32Vec8 scale_vec(inv_rms_f);
|
||||
FP32Vec8 y = x_vec * scale_vec;
|
||||
y.save(output + i);
|
||||
}
|
||||
|
||||
for (; i < n; ++i) {
|
||||
output[i] = input[i] * inv_rms_f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Prefetch data to cache for better memory access performance
|
||||
FORCE_INLINE void prefetch(const void* addr) {
|
||||
__builtin_prefetch(addr, 0, 3); // 0=read, 3=high temporal locality
|
||||
}
|
||||
|
||||
}; // namespace vec_op
|
||||
|
||||
|
||||
@ -489,14 +489,16 @@ __global__ void Marlin(
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int idx = tid4 * 4 + i;
|
||||
idx = idx < block_num_valid_tokens ? idx : 0;
|
||||
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
|
||||
sh_block_topk_weights[idx] = __hmul2(
|
||||
global_scale, Dtype::num2num2(Dtype::float2num(
|
||||
topk_weights_ptr[sh_block_sorted_ids[idx]])));
|
||||
} else {
|
||||
sh_block_topk_weights[idx] = Dtype::num2num2(
|
||||
Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
|
||||
if (idx < block_num_valid_tokens) {
|
||||
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
|
||||
sh_block_topk_weights[idx] =
|
||||
__hmul2(global_scale,
|
||||
Dtype::num2num2(Dtype::float2num(
|
||||
topk_weights_ptr[sh_block_sorted_ids[idx]])));
|
||||
} else {
|
||||
sh_block_topk_weights[idx] = Dtype::num2num2(
|
||||
Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -52,14 +52,13 @@ void paged_attention_v2(
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
const int64_t blocksparse_head_sliding_step);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
void merge_attn_states(torch::Tensor& output,
|
||||
std::optional<torch::Tensor> output_lse,
|
||||
const torch::Tensor& prefix_output,
|
||||
const torch::Tensor& prefix_lse,
|
||||
const torch::Tensor& suffix_output,
|
||||
const torch::Tensor& suffix_lse);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
void convert_vertical_slash_indexes(
|
||||
torch::Tensor& block_count, // [BATCH, N_HEADS, NUM_ROWS]
|
||||
torch::Tensor& block_offset, // [BATCH, N_HEADS, NUM_ROWS, NNZ_S]
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include "cutlass_extensions/common.hpp"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
@ -173,7 +174,7 @@ void run_get_group_gemm_starts(
|
||||
}
|
||||
|
||||
template <typename OutType>
|
||||
void run_fp4_blockwise_scaled_group_mm(
|
||||
void run_fp4_blockwise_scaled_group_mm_sm100(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
@ -343,17 +344,225 @@ void run_fp4_blockwise_scaled_group_mm(
|
||||
|
||||
auto can_implement_status = gemm_op.can_implement(args);
|
||||
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
|
||||
"Failed to implement GEMM");
|
||||
"Failed to implement GEMM: status=", (int)can_implement_status);
|
||||
|
||||
// Run the GEMM
|
||||
auto status = gemm_op.initialize(args, workspace.data_ptr());
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to initialize GEMM");
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess,
|
||||
"Failed to initialize GEMM: status=", (int)status,
|
||||
" workspace_size=", workspace_size, " num_experts=", num_experts,
|
||||
" M=", M, " N=", N, " K=", K);
|
||||
|
||||
status = gemm_op.run(args, workspace.data_ptr(), stream);
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
|
||||
}
|
||||
|
||||
void run_fp4_blockwise_scaled_group_mm_sm120(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
|
||||
int N, int K) {
|
||||
using ProblemShape =
|
||||
cutlass::gemm::GroupProblemShape<Shape<int32_t, int32_t, int32_t>>;
|
||||
using ElementType = cutlass::float_e2m1_t;
|
||||
using ElementSFType = cutlass::float_ue4m3_t;
|
||||
using ElementA = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
|
||||
using ElementB = cutlass::nv_float4_t<cutlass::float_e2m1_t>;
|
||||
|
||||
// NOTE: For SM120 it seems templating the output type is not supported and
|
||||
// we need to hardcode the output type to bfloat16
|
||||
using ElementC = cutlass::bfloat16_t;
|
||||
using ElementD = ElementC;
|
||||
using ElementAccumulator = float;
|
||||
// Layout definitions
|
||||
using LayoutA = cutlass::layout::RowMajor;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
using LayoutC = cutlass::layout::RowMajor;
|
||||
using LayoutD = LayoutC;
|
||||
|
||||
// Alignment constraints
|
||||
static constexpr int AlignmentA = 32;
|
||||
static constexpr int AlignmentB = 32;
|
||||
static constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;
|
||||
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||
|
||||
// Architecture definitions
|
||||
using ArchTag = cutlass::arch::Sm120;
|
||||
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;
|
||||
|
||||
using ClusterShape = Shape<_1, _1, _1>;
|
||||
using MmaTileShape = Shape<_128, _128, _128>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinearCombination<
|
||||
ElementD, ElementAccumulator, ElementC, ElementAccumulator>;
|
||||
|
||||
using CollectiveEpilogue =
|
||||
typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator,
|
||||
ElementAccumulator, ElementC, LayoutC*, AlignmentC, ElementD,
|
||||
LayoutD*, AlignmentD,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop =
|
||||
typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass, ElementA, LayoutA*, AlignmentA, ElementB,
|
||||
LayoutB*, AlignmentB, ElementAccumulator, MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
|
||||
sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::collective::KernelScheduleAuto>::CollectiveOp;
|
||||
|
||||
using GemmKernel =
|
||||
cutlass::gemm::kernel::GemmUniversal<ProblemShape, CollectiveMainloop,
|
||||
CollectiveEpilogue>;
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
|
||||
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
|
||||
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
|
||||
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
|
||||
|
||||
using LayoutSFA =
|
||||
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA;
|
||||
using LayoutSFB =
|
||||
typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB;
|
||||
using ScaleConfig =
|
||||
typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;
|
||||
|
||||
using UnderlyingProblemShape = ProblemShape::UnderlyingProblemShape;
|
||||
int num_experts = static_cast<int>(expert_offsets.size(0));
|
||||
auto options_int =
|
||||
torch::TensorOptions().dtype(torch::kInt64).device(a.device());
|
||||
|
||||
torch::Tensor a_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor b_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor out_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor alpha_ptrs = torch::empty(num_experts, options_int);
|
||||
torch::Tensor layout_sfa = torch::empty({num_experts, 5}, options_int);
|
||||
torch::Tensor layout_sfb = torch::empty({num_experts, 5}, options_int);
|
||||
torch::Tensor c_strides1 =
|
||||
torch::full({num_experts}, output.stride(0), options_int);
|
||||
torch::Tensor a_strides1 =
|
||||
torch::full({num_experts}, a.stride(0) * 2, options_int);
|
||||
torch::Tensor b_strides1 =
|
||||
torch::full({num_experts}, b.stride(1) * 2, options_int);
|
||||
|
||||
run_get_group_gemm_starts<LayoutSFA, LayoutSFB, ScaleConfig>(
|
||||
a_ptrs, b_ptrs, out_ptrs, a_scales_ptrs, b_scales_ptrs, alpha_ptrs,
|
||||
layout_sfa, layout_sfb, a, b, output, a_blockscale, b_blockscales, alphas,
|
||||
expert_offsets, sf_offsets, problem_sizes, M, N, K);
|
||||
|
||||
// Create an instance of the GEMM
|
||||
Gemm gemm_op;
|
||||
|
||||
// Initialize problem_sizes_as_shapes correctly
|
||||
UnderlyingProblemShape* problem_sizes_as_shapes =
|
||||
static_cast<UnderlyingProblemShape*>(problem_sizes.data_ptr());
|
||||
|
||||
// Set the Scheduler info
|
||||
cutlass::KernelHardwareInfo hw_info;
|
||||
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
|
||||
typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
|
||||
scheduler.raster_order = RasterOrderOptions::AlongM;
|
||||
hw_info.device_id = a.get_device();
|
||||
static std::unordered_map<int, int> cached_sm_counts;
|
||||
if (cached_sm_counts.find(hw_info.device_id) == cached_sm_counts.end()) {
|
||||
cached_sm_counts[hw_info.device_id] =
|
||||
cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
|
||||
hw_info.device_id);
|
||||
}
|
||||
hw_info.sm_count = min(cached_sm_counts[hw_info.device_id], INT_MAX);
|
||||
|
||||
// Mainloop Arguments
|
||||
typename GemmKernel::MainloopArguments mainloop_args{
|
||||
static_cast<const ElementType**>(a_ptrs.data_ptr()),
|
||||
static_cast<StrideA*>(a_strides1.data_ptr()),
|
||||
static_cast<const ElementType**>(b_ptrs.data_ptr()),
|
||||
static_cast<StrideB*>(b_strides1.data_ptr()),
|
||||
static_cast<const ElementSFType**>(a_scales_ptrs.data_ptr()),
|
||||
reinterpret_cast<LayoutSFA*>(layout_sfa.data_ptr()),
|
||||
static_cast<const ElementSFType**>(b_scales_ptrs.data_ptr()),
|
||||
reinterpret_cast<LayoutSFB*>(layout_sfb.data_ptr())};
|
||||
|
||||
// Epilogue Arguments
|
||||
typename GemmKernel::EpilogueArguments epilogue_args{
|
||||
{}, // epilogue.thread
|
||||
nullptr,
|
||||
static_cast<StrideC*>(c_strides1.data_ptr()),
|
||||
static_cast<ElementD**>(out_ptrs.data_ptr()),
|
||||
static_cast<StrideC*>(c_strides1.data_ptr())};
|
||||
auto& fusion_args = epilogue_args.thread;
|
||||
fusion_args.alpha_ptr_array =
|
||||
reinterpret_cast<float**>(alpha_ptrs.data_ptr());
|
||||
fusion_args.dAlpha = {_0{}, _0{}, 1};
|
||||
fusion_args.beta = 0.0f;
|
||||
|
||||
// Gemm Arguments
|
||||
typename GemmKernel::Arguments args{
|
||||
cutlass::gemm::GemmUniversalMode::kGrouped,
|
||||
{num_experts, problem_sizes_as_shapes, nullptr},
|
||||
mainloop_args,
|
||||
epilogue_args,
|
||||
hw_info,
|
||||
scheduler};
|
||||
|
||||
size_t workspace_size = Gemm::get_workspace_size(args);
|
||||
auto const workspace_options =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(a.device());
|
||||
auto workspace = torch::empty(workspace_size, workspace_options);
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(a.get_device());
|
||||
|
||||
auto can_implement_status = gemm_op.can_implement(args);
|
||||
TORCH_CHECK(can_implement_status == cutlass::Status::kSuccess,
|
||||
"Failed to implement GEMM: status=", (int)can_implement_status);
|
||||
|
||||
// Run the GEMM
|
||||
auto status = gemm_op.initialize(args, workspace.data_ptr());
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess,
|
||||
"Failed to initialize GEMM: status=", (int)status,
|
||||
" workspace_size=", workspace_size, " num_experts=", num_experts,
|
||||
" M=", M, " N=", N, " K=", K);
|
||||
|
||||
status = gemm_op.run(args, workspace.data_ptr(), stream);
|
||||
TORCH_CHECK(status == cutlass::Status::kSuccess, "Failed to run GEMM");
|
||||
}
|
||||
|
||||
template <typename OutType>
|
||||
void run_fp4_blockwise_scaled_group_mm(
|
||||
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets, int M,
|
||||
int N, int K) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
|
||||
if (version_num >= 120 && version_num < 130) {
|
||||
run_fp4_blockwise_scaled_group_mm_sm120(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
if (version_num >= 100 && version_num < 120) {
|
||||
run_fp4_blockwise_scaled_group_mm_sm100<OutType>(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled cutlass_fp4_group_mm kernel for CUDA device capability: ",
|
||||
version_num, ". Required capability: 100 or 120");
|
||||
}
|
||||
|
||||
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
|
||||
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
|
||||
constexpr auto FLOAT4_E2M1X2 = at::ScalarType::Byte;
|
||||
constexpr auto SF_DTYPE = at::ScalarType::Float8_e4m3fn;
|
||||
#endif
|
||||
@ -374,7 +583,8 @@ void cutlass_fp4_group_mm(
|
||||
const torch::Tensor& a_blockscale, const torch::Tensor& b_blockscales,
|
||||
const torch::Tensor& alphas, const torch::Tensor& problem_sizes,
|
||||
const torch::Tensor& expert_offsets, const torch::Tensor& sf_offsets) {
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
#if (defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100) || \
|
||||
(defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120)
|
||||
// Input validation
|
||||
CHECK_INPUT(a, FLOAT4_E2M1X2, "a");
|
||||
CHECK_INPUT(b, FLOAT4_E2M1X2, "b");
|
||||
@ -408,6 +618,14 @@ void cutlass_fp4_group_mm(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
} else {
|
||||
#if defined ENABLE_NVFP4_SM120 && ENABLE_NVFP4_SM120
|
||||
int32_t version_num = get_sm_version_num();
|
||||
if (version_num >= 120 && version_num < 130) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "SM120 NVFP4 MOE only supports bfloat16 output, got: ",
|
||||
output.scalar_type());
|
||||
}
|
||||
#endif
|
||||
run_fp4_blockwise_scaled_group_mm<cutlass::half_t>(
|
||||
output, a, b, a_blockscale, b_blockscales, alphas, problem_sizes,
|
||||
expert_offsets, sf_offsets, M, N, K);
|
||||
@ -416,8 +634,8 @@ void cutlass_fp4_group_mm(
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"No compiled cutlass_fp4_group_mm kernel, vLLM must "
|
||||
"be compiled with ENABLE_NVFP4_SM100 for SM100+ and CUDA "
|
||||
"12.8 or above.");
|
||||
"be compiled with ENABLE_NVFP4_SM100 or ENABLE_NVFP4_SM120 for SM100/120 "
|
||||
"and CUDA 12.8 or above.");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -307,7 +307,7 @@ constexpr auto FLOAT = at::ScalarType::Float;
|
||||
constexpr auto INT = at::ScalarType::Int;
|
||||
constexpr auto UINT8 = at::ScalarType::Byte;
|
||||
|
||||
void scaled_fp4_experts_quant_sm100a(
|
||||
void scaled_fp4_experts_quant_sm1xxa(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
|
||||
@ -24,8 +24,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
torch::Tensor const& input_sf);
|
||||
#endif
|
||||
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
void scaled_fp4_experts_quant_sm100a(
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
void scaled_fp4_experts_quant_sm1xxa(
|
||||
torch::Tensor& output, torch::Tensor& output_scale,
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
@ -54,8 +55,9 @@ void scaled_fp4_experts_quant(
|
||||
torch::Tensor const& input, torch::Tensor const& input_global_scale,
|
||||
torch::Tensor const& input_offset_by_experts,
|
||||
torch::Tensor const& output_scale_offset_by_experts) {
|
||||
#if defined ENABLE_NVFP4_SM100 && ENABLE_NVFP4_SM100
|
||||
return scaled_fp4_experts_quant_sm100a(
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
return scaled_fp4_experts_quant_sm1xxa(
|
||||
output, output_scale, input, input_global_scale, input_offset_by_experts,
|
||||
output_scale_offset_by_experts);
|
||||
#endif
|
||||
|
||||
@ -67,9 +67,9 @@ void cutlass_scaled_mm_sm100(torch::Tensor& c, torch::Tensor const& a,
|
||||
std::optional<torch::Tensor> const& bias);
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SCALED_MM_SM90) && ENABLE_SCALED_MM_SM90 || \
|
||||
defined(ENABLE_SCALED_MM_SM100) && ENABLE_SCALED_MM_SM100 || \
|
||||
defined(ENABLE_SCALED_MM_SM120) && ENABLE_SCALED_MM_SM120
|
||||
#if (defined(ENABLE_CUTLASS_MOE_SM90) && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined(ENABLE_CUTLASS_MOE_SM100) && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined(ENABLE_CUTLASS_MOE_SM120) && ENABLE_CUTLASS_MOE_SM120)
|
||||
void get_cutlass_moe_mm_data_caller(
|
||||
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
|
||||
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
|
||||
@ -284,8 +284,9 @@ void get_cutlass_moe_mm_data(
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
|
||||
problem_sizes2, input_permutation,
|
||||
output_permutation, num_experts, n, k,
|
||||
@ -296,7 +297,7 @@ void get_cutlass_moe_mm_data(
|
||||
false,
|
||||
"No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
|
||||
"CUDA device capability: ",
|
||||
version_num, ". Required capability: 90 or 100");
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_moe_mm_problem_sizes(
|
||||
@ -304,8 +305,9 @@ void get_cutlass_moe_mm_problem_sizes(
|
||||
torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
|
||||
const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets) {
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1,
|
||||
problem_sizes2, num_experts, n, k,
|
||||
blockscale_offsets);
|
||||
@ -315,7 +317,7 @@ void get_cutlass_moe_mm_problem_sizes(
|
||||
false,
|
||||
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
|
||||
"kernel for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90 or 100");
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
@ -328,8 +330,9 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
// This function currently gets compiled only if we have a valid cutlass moe
|
||||
// mm to run it for.
|
||||
int32_t version_num = get_sm_version_num();
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
|
||||
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
|
||||
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
|
||||
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
|
||||
problem_sizes2, expert_num_tokens,
|
||||
num_local_experts, padded_m, n, k);
|
||||
@ -339,7 +342,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
|
||||
false,
|
||||
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
|
||||
"for CUDA device capability: ",
|
||||
version_num, ". Required capability: 90 or 100");
|
||||
version_num, ". Required capability: 90, 100, or 120");
|
||||
}
|
||||
|
||||
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
|
||||
|
||||
@ -63,7 +63,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" int blocksparse_head_sliding_step) -> ()");
|
||||
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// Merge attn states
|
||||
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
|
||||
// can be used to combine partial attention results (in the split-KV case)
|
||||
@ -76,7 +75,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" Tensor suffix_output,"
|
||||
" Tensor suffix_lse) -> ()");
|
||||
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
ops.def(
|
||||
"convert_vertical_slash_indexes("
|
||||
" Tensor! block_count, Tensor! block_offset, "
|
||||
@ -695,7 +694,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
cache_ops.def(
|
||||
"gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, "
|
||||
" Tensor block_table, Tensor cu_seq_lens, "
|
||||
" int batch_size, "
|
||||
" Tensor token_to_seq, "
|
||||
" int num_tokens, "
|
||||
" str kv_cache_dtype, "
|
||||
" Tensor scale, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,
|
||||
|
||||
@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
|
||||
# glibc version is baked into the distro, and binaries built with one glibc
|
||||
# version are not backwards compatible with OSes that use an earlier version.
|
||||
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
|
||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
|
||||
|
||||
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
|
||||
# their own mirror. When doing so, we don't benefit from the transparent
|
||||
@ -85,7 +85,20 @@ ARG GET_PIP_URL
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ccache \
|
||||
software-properties-common \
|
||||
git \
|
||||
curl \
|
||||
sudo \
|
||||
python3-pip \
|
||||
libibverbs-dev \
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
# as it was causing spam when compiling the CUTLASS kernels
|
||||
gcc-10 \
|
||||
g++-10 \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
||||
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
||||
@ -110,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
# as it was causing spam when compiling the CUTLASS kernels
|
||||
RUN apt-get install -y gcc-10 g++-10
|
||||
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
|
||||
RUN <<EOF
|
||||
gcc --version
|
||||
EOF
|
||||
@ -235,9 +244,15 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
# Install EP kernels(pplx-kernels and DeepEP)
|
||||
ARG PPLX_COMMIT_HASH
|
||||
ARG DEEPEP_COMMIT_HASH
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
|
||||
/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
|
||||
/tmp/install_python_libraries.sh \
|
||||
--workspace /tmp/ep_kernels_workspace \
|
||||
--mode wheel \
|
||||
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
|
||||
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
|
||||
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
|
||||
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
@ -268,7 +283,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Install libnuma-dev, required by fastsafetensors (fixes #20384)
|
||||
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
|
||||
COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
@ -305,8 +320,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y software-properties-common curl sudo python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
software-properties-common \
|
||||
curl \
|
||||
sudo \
|
||||
python3-pip \
|
||||
ffmpeg \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libgl1 \
|
||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
||||
mkdir -p -m 0755 /etc/apt/keyrings ; \
|
||||
@ -321,13 +343,30 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
done ; \
|
||||
fi \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
python${PYTHON_VERSION} \
|
||||
python${PYTHON_VERSION}-dev \
|
||||
python${PYTHON_VERSION}-venv \
|
||||
libibverbs-dev \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
# Install CUDA development tools and build essentials for runtime JIT compilation
|
||||
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
|
||||
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
cuda-nvcc-${CUDA_VERSION_DASH} \
|
||||
cuda-cudart-${CUDA_VERSION_DASH} \
|
||||
cuda-nvrtc-${CUDA_VERSION_DASH} \
|
||||
cuda-cuobjdump-${CUDA_VERSION_DASH} \
|
||||
libcublas-${CUDA_VERSION_DASH} && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ARG PIP_INDEX_URL UV_INDEX_URL
|
||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
@ -359,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||
# https://docs.flashinfer.ai/installation.html
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system flashinfer-cubin==0.5.2 \
|
||||
&& uv pip install --system flashinfer-jit-cache==0.5.2 \
|
||||
uv pip install --system flashinfer-cubin==0.5.3 \
|
||||
&& uv pip install --system flashinfer-jit-cache==0.5.3 \
|
||||
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||
&& flashinfer show-config
|
||||
|
||||
|
||||
@ -119,7 +119,6 @@ FROM base AS vllm-test-deps
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
|
||||
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
|
||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
cp requirements/test.in requirements/cpu-test.in && \
|
||||
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
|
||||
@ -132,9 +131,6 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
|
||||
esac; \
|
||||
}; \
|
||||
remove_packages_not_supported_on_aarch64 && \
|
||||
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
|
||||
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
|
||||
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/common.txt
|
||||
|
||||
# must put before installing xformers, so it can install the correct version of xfomrers.
|
||||
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
|
||||
# Build xformers with cuda and torch nightly
|
||||
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
|
||||
# todo(elainewy): cache xformers build result for faster build
|
||||
ARG max_jobs=16
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
|
||||
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
echo 'git clone xformers...' \
|
||||
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
|
||||
&& cd xformers \
|
||||
&& git checkout ${XFORMERS_COMMIT} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo 'finish git clone xformers...' \
|
||||
&& rm -rf build \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
|
||||
&& cd .. \
|
||||
&& rm -rf xformers
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system xformers-dist/*.whl --verbose
|
||||
|
||||
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
|
||||
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
|
||||
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
|
||||
@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system vllm-dist/*.whl --verbose
|
||||
|
||||
# install xformers again for the new environment
|
||||
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
|
||||
|
||||
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
|
||||
|
||||
# install package for build flashinfer
|
||||
@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/nightly_torch_test.txt
|
||||
|
||||
# Logging to confirm the torch versions
|
||||
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
|
||||
RUN pip freeze | grep -E 'torch|vllm|flashinfer'
|
||||
|
||||
# Logging to confirm all the packages are installed
|
||||
RUN pip freeze
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
|
||||
FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
@ -25,10 +25,14 @@ RUN apt clean && apt-get update -y && \
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
|
||||
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
|
||||
|
||||
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
|
||||
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
|
||||
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
|
||||
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
|
||||
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
|
||||
|
||||
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
|
||||
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
|
||||
SHELL ["bash", "-c"]
|
||||
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
|
||||
|
||||
@ -72,6 +76,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
ENV NIXL_VERSION=0.7.0
|
||||
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
|
||||
|
||||
# remove torch bundled oneccl to avoid conflicts
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip uninstall oneccl oneccl-devel -y
|
||||
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 131 KiB After Width: | Height: | Size: 146 KiB |
@ -10,6 +10,7 @@ Stay tuned for upcoming meetups! Follow us on [Twitter/X](https://x.com/vllm_pro
|
||||
|
||||
Below you'll find slides and recordings from our previous meetups:
|
||||
|
||||
- [vLLM Bangkok Meetup](https://luma.com/v0f647nv), November 21st 2025. [[Slides]](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing)
|
||||
- [vLLM Zurich Meetup](https://luma.com/0gls27kb), November 6th 2025. [[Slides]](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) [[Recording]](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
|
||||
- [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w), November 1st 2025. [[Slides]](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link)
|
||||
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6)
|
||||
|
||||
@ -98,21 +98,6 @@ to warm it up so that future builds are faster.
|
||||
<img width="60%" alt="Buildkite new build popup" src="https://github.com/user-attachments/assets/a8ff0fcd-76e0-4e91-b72f-014e3fdb6b94">
|
||||
</p>
|
||||
|
||||
## Update dependencies
|
||||
|
||||
Several vLLM dependencies like xFormers depend on PyTorch and need
|
||||
to be updated accordingly. Rather than waiting for all of them to publish new
|
||||
releases (which would take too much time), they can be built from
|
||||
source to unblock the update process.
|
||||
|
||||
### xFormers
|
||||
|
||||
```bash
|
||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
|
||||
MAX_JOBS=16 uv pip install --system \
|
||||
--no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
|
||||
```
|
||||
|
||||
## Update all the different vLLM platforms
|
||||
|
||||
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable
|
||||
|
||||
@ -29,7 +29,7 @@ The initialization code should look like this:
|
||||
```python
|
||||
from torch import nn
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.attention import Attention
|
||||
from vllm.attention.layer import Attention
|
||||
|
||||
class MyAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
|
||||
@ -2,4 +2,4 @@
|
||||
|
||||
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
|
||||
|
||||
Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
|
||||
Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
|
||||
|
||||
@ -84,12 +84,14 @@ See the following figures for a quick comparison between the previous and curren
|
||||
```python
|
||||
class BatchDescriptor(NamedTuple):
|
||||
num_tokens: int
|
||||
uniform_decode: bool = False
|
||||
num_reqs: int
|
||||
uniform: bool = False
|
||||
has_lora: bool = False
|
||||
```
|
||||
|
||||
where `num_tokens` can be the padded token length, and `uniform_decode` is determined by if `max_query_len` of a batch is equal to the desired `max_query_len` of a uniform_decode, and the num_scheduled_tokens is divisible by that desired `max_query_len`.
|
||||
where `num_tokens` can be the padded token length, and `uniform` indicates if all the requests have the same query lengths. Many attention backends only support full cudagraphs when the batches are uniform; pure decode batches are uniform but may not be query length 1 (i.e. `num_tokens == num_reqs`), this occurs in the validation pass of spec-decode where "decode" batches will have a query length of `1+num_spec_tokens`.
|
||||
|
||||
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item. We are safe to exclude items like `uniform_query_len` because it is a constant at runtime for a certain setup currently. For example, it should be either `1` for a commonly pure decode or `1+num_spec_tokens` for a validation phase of speculative decode.
|
||||
The goal of this structure is to uniquely identify a (padded) batch with minimal possible items corresponding to a CUDA Graphs item.
|
||||
|
||||
!!! note
|
||||
The prototype of `BatchDescriptor` may be extended for more general situations in the future, e.g., include more items, like `uniform_query_len` to support multiple different uniform decode lengths settings (<https://github.com/vllm-project/vllm/pull/23679>), or other modifications needed to support CUDA Graphs for models whose inputs are not necessarily token length aware (for example, some multi-modal inputs).
|
||||
|
||||
@ -151,6 +151,76 @@ To avoid this, please either:
|
||||
2. wrap the branching logic into a custom operator. TorchDynamo does not
|
||||
trace into custom operators.
|
||||
|
||||
## Debugging constraint violations and dynamic shapes guards issues
|
||||
|
||||
Dynamic-shape guards are a specific category of Dynamo guards. They are constraints that `torch.compile`
|
||||
attaches to dynamic dimensions (e.g., `seq_len`) to ensure the compiled artifact remains valid.
|
||||
These guards typically appear when framework code, custom passes, or user code branches based on
|
||||
dynamic shape values.
|
||||
|
||||
**Example:**
|
||||
|
||||
```python
|
||||
if x > 10:
|
||||
# path A
|
||||
else:
|
||||
# path B
|
||||
```
|
||||
|
||||
This creates a guard `x > 10` or `x <= 10` depending on which path was traced.
|
||||
|
||||
**vLLM's Assumption:**
|
||||
vLLM assumes that all guards added by torch.compile are safe to drop and will not
|
||||
constrain the compiled graph to specific input shapes. When this assumption is violated,
|
||||
it can cause issues that users need to debug.
|
||||
Some side effects that indicates this assumption is violated are runtime errors
|
||||
or `ConstraintViolationErrors`.
|
||||
|
||||
A `ConstraintViolationErrors` will be thrown if a dynamic shape gets constrained to
|
||||
a single value. If you encounter a constraint violation error or suspect that a dynamic
|
||||
shapes guard is being added incorrectly, you can use stricter dynamic shape modes to
|
||||
help debug the issue:
|
||||
|
||||
```sh
|
||||
# Online - using unbacked mode
|
||||
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
|
||||
|
||||
# Online - using backed_size_oblivious mode
|
||||
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
|
||||
```
|
||||
|
||||
```py
|
||||
# Offline - using unbacked mode
|
||||
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
|
||||
LLM(model, compilation_config=CompilationConfig(
|
||||
dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.UNBACKED)
|
||||
))
|
||||
|
||||
# Offline - using backed_size_oblivious mode
|
||||
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
|
||||
LLM(model, compilation_config=CompilationConfig(
|
||||
dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS)
|
||||
))
|
||||
```
|
||||
|
||||
These modes are stricter and reduce or eliminate the need for dynamic shapes guarding, which can help isolate issues:
|
||||
|
||||
- `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added
|
||||
- `backed_size_oblivious`: Uses a mode that is more strict about guarding.
|
||||
|
||||
For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping).
|
||||
|
||||
### Printing guards
|
||||
|
||||
To see all guards that are being added during compilation, you can use `TORCH_LOGS=+dynamic`:
|
||||
|
||||
```sh
|
||||
TORCH_LOGS=+dynamic vllm serve meta-llama/Llama-3.2-1B
|
||||
```
|
||||
|
||||
Look for `[guard added]` in the logs to see where guards are being added. This can help you identify which operations are
|
||||
causing guards to be added incorrectly.
|
||||
|
||||
## Debugging TorchInductor
|
||||
|
||||
TorchInductor takes a captured graph and then compiles it down to some Python code
|
||||
|
||||
@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
|
||||
|
||||
- [`ModelOptFp8MoEMethod`][vllm.model_executor.layers.quantization.modelopt.ModelOptFp8MoEMethod]
|
||||
- [`Fp8MoEMethod`][vllm.model_executor.layers.quantization.fp8.Fp8MoEMethod]
|
||||
- [`CompressedTensorsW4A4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4MoeMethod]
|
||||
- [`CompressedTensorsW4A4Nvfp4MoeMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW4A4Nvfp4MoeMethod]
|
||||
- [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.CompressedTensorsW8A8Fp8MoEMethod]
|
||||
- [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod]
|
||||
- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
|
||||
|
||||
69
docs/design/optimization_levels.md
Normal file
69
docs/design/optimization_levels.md
Normal file
@ -0,0 +1,69 @@
|
||||
<!-- markdownlint-disable -->
|
||||
|
||||
# Optimization Levels
|
||||
|
||||
## Overview
|
||||
|
||||
vLLM now supports optimization levels (`-O0`, `-O1`, `-O2`, `-O3`). Optimization levels provide an intuitive mechanism for users to trade startup time for performance. Higher levels have better performance but worse startup time. These optimization levels have associated defaults to help users get desired out-of-the-box performance. Importantly, defaults set by optimization levels are purely defaults; explicit user settings will not be overwritten.
|
||||
|
||||
## Level Summaries and Usage Examples
|
||||
```bash
|
||||
# CLI usage
|
||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O0
|
||||
|
||||
# Python API usage
|
||||
from vllm.entrypoints.llm import LLM
|
||||
|
||||
llm = LLM(
|
||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
||||
optimization_level=0
|
||||
)
|
||||
```
|
||||
|
||||
#### `-O1`: Quick Optimizations
|
||||
- **Startup**: Moderate startup time
|
||||
- **Performance**: Inductor compilation, CUDAGraphMode.PIECEWISE
|
||||
- **Use case**: Balance for most development scenarios
|
||||
|
||||
```bash
|
||||
# CLI usage
|
||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O1
|
||||
|
||||
# Python API usage
|
||||
from vllm.entrypoints.llm import LLM
|
||||
|
||||
llm = LLM(
|
||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
||||
optimization_level=1
|
||||
)
|
||||
```
|
||||
|
||||
#### `-O2`: Full Optimizations (Default)
|
||||
- **Startup**: Longer startup time
|
||||
- **Performance**: `-O1` + CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
- **Use case**: Production workloads where performance is important. This is the default use case. It is also very similar to the previous default. The primary difference is that noop & fusion flags are enabled.
|
||||
|
||||
```bash
|
||||
# CLI usage (default, so optional)
|
||||
python -m vllm.entrypoints.api_server --model RedHatAI/Llama-3.2-1B-FP8 -O2
|
||||
|
||||
# Python API usage
|
||||
from vllm.entrypoints.llm import LLM
|
||||
|
||||
llm = LLM(
|
||||
model="RedHatAI/Llama-3.2-1B-FP8",
|
||||
optimization_level=2 # This is the default
|
||||
)
|
||||
```
|
||||
|
||||
#### `-O3`: Full Optimization
|
||||
Still in development. Added infrastructure to prevent changing API in future
|
||||
release. Currently behaves the same as O2.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Startup Time Too Long**: Use `-O0` or `-O1` for faster startup
|
||||
2. **Compilation Errors**: Use `debug_dump_path` for additional debugging information
|
||||
3. **Performance Issues**: Ensure using `-O2` for production
|
||||
@ -29,6 +29,109 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
|
||||
|
||||
By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
|
||||
|
||||
## Dynamic shapes and vLLM guard dropping
|
||||
|
||||
`torch.compile` is designed to guard on dynamic shapes with no hesitation
|
||||
when needed. This contradicts with vLLM's `torch.compile` approach of
|
||||
dropping the guards since many of those guards could be material.
|
||||
|
||||
`torch.compile` provides two kinds of dynamic shapes: `backed` and `unbacked`.
|
||||
`torch.compile` guards on `backed` dynamic shapes and does not provide a
|
||||
guarantee that no guards will be added to them. User code, dynamo,
|
||||
inductor, and autograd all can add guards. Moreover, for 0/1
|
||||
specializations, backed symbols are specialized unconditionally to 0, 1,
|
||||
or >=2 even without encountering a branching on those ranges.
|
||||
|
||||
On the contrary, `unbacked` dynamic shapes are guaranteed not to be guarded
|
||||
on and are not 0/1 specialized. However, there is a possibility of
|
||||
throwing a data dependent error when a branch that requires their value is
|
||||
encountered and no explicit unbacked handling is defined. The framework is
|
||||
converging to a state where it won't throw DDE but rather pick general
|
||||
paths. One downside of using unbacked is missed optimization opportunities
|
||||
due to either perf bugs or picking general paths, also using a fixed
|
||||
non-example input-based hint (this will be fixed soon with override_hint
|
||||
API). An example of picking general paths is assuming the input is not contiguous
|
||||
in functions that call contiguous() and reshape() when contiguity can't be symbolically proven,
|
||||
with a chance of introducing a clone.
|
||||
|
||||
`backed_size_oblivious` is a flag that enables treating backed symbols as
|
||||
unbacked wherever explicit handling for unbacked is defined. With this
|
||||
mode, 0/1 specializations are mostly avoided in framework code and the
|
||||
default 0/1 specialization does not happen. However, there is still no
|
||||
guarantee that torch.compile won't guard, especially due to user code or
|
||||
custom passes. `backed_size_oblivious` is experimental in PyTorch compile
|
||||
and could be deprecated. That said, it's a safer option to use than
|
||||
`backed` and the probability of reducing performance is lower than
|
||||
`unbacked`.
|
||||
|
||||
### Configuring Dynamic Shapes
|
||||
|
||||
The `DynamicShapesConfig` allows you to control the dynamic shapes behavior by
|
||||
setting the `type` field. You can choose between three modes:
|
||||
`BACKED`(default), `UNBACKED` , and `BACKED_SIZE_OBLIVIOUS`.
|
||||
|
||||
#### Offline Inference Example (Using LLM class)
|
||||
|
||||
When using the `LLM` class for offline inference, you can configure dynamic
|
||||
shapes through the `compilation_config` parameter:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
|
||||
|
||||
# Example: Using backed_size_oblivious (experimental, safer than backed)
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B",
|
||||
compilation_config=CompilationConfig(
|
||||
dynamic_shapes_config=DynamicShapesConfig(
|
||||
type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Example: Using unbacked (strongest guarantee against guards)
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-1B",
|
||||
compilation_config=CompilationConfig(
|
||||
dynamic_shapes_config=DynamicShapesConfig(
|
||||
type=DynamicShapesType.UNBACKED
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Generate outputs
|
||||
prompts = ["Hello, my name is", "The future of AI is"]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
```
|
||||
|
||||
#### Online Serving Example (Using vllm serve)
|
||||
|
||||
When using `vllm serve` for online serving, you can configure dynamic shapes
|
||||
through the `--compilation-config` flag:
|
||||
|
||||
```bash
|
||||
# Example: Using unbacked
|
||||
vllm serve meta-llama/Llama-3.2-1B \
|
||||
--compilation-config '{"dynamic_shapes_config": {"type": "unbacked"}}'
|
||||
|
||||
|
||||
# Alternative: Using dot notation (simpler for single values)
|
||||
vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
|
||||
```
|
||||
|
||||
#### Choosing the Right Mode
|
||||
|
||||
- **BACKED** (default): Use when you're willing to accept potential unsafe dropping of guards
|
||||
for maximal performance. Guards could be unsoundly added and then ignored.
|
||||
|
||||
- **UNBACKED**: Use when you need the strongest guarantee against guards.
|
||||
This is the most conservative option but may miss some optimization opportunities.
|
||||
|
||||
- **BACKED_SIZE_OBLIVIOUS**: Use when you want a balance between avoiding guards
|
||||
and performance. This experimental mode is safer than BACKED but still not as
|
||||
conservative as UNBACKED.
|
||||
|
||||
## Python Code Compilation
|
||||
|
||||
In the very verbose logs, we can see:
|
||||
@ -122,7 +225,7 @@ When all the shapes are known, `torch.compile` can compare different configs, an
|
||||
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
|
||||
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
|
||||
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
|
||||
mm 0.0160 ms 81.6%
|
||||
mm 0.0160 ms 81.6%
|
||||
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
|
||||
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
|
||||
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
|
||||
|
||||
@ -22,9 +22,6 @@ export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxab
|
||||
vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_parallel_size 8
|
||||
```
|
||||
|
||||
!!! tip
|
||||
If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop.
|
||||
|
||||
!!! tip
|
||||
When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables:
|
||||
`VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes.
|
||||
|
||||
@ -7,7 +7,7 @@ This document shows you some examples of the different options that are
|
||||
available to generate structured outputs.
|
||||
|
||||
!!! warning
|
||||
If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
|
||||
If you are still using the following deprecated API fields which were removed in v0.12.0, please update your code to use `structured_outputs` as demonstrated in the rest of this document:
|
||||
|
||||
- `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)`
|
||||
- `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)`
|
||||
|
||||
@ -283,7 +283,7 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
|
||||
|
||||
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
|
||||
|
||||
- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
|
||||
- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
|
||||
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
|
||||
|
||||
For AMD ROCm, you can further control the specific Attention implementation using the following variables:
|
||||
|
||||
@ -1,25 +1,33 @@
|
||||
# CPU - Intel® Xeon®
|
||||
|
||||
## Validated Hardware
|
||||
|
||||
| Hardware |
|
||||
| ----------------------------------------- |
|
||||
| [Intel® Xeon® 6 Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon.html) |
|
||||
| [Intel® Xeon® 5 Processors](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/5th-gen-xeon-scalable-processors.html) |
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Text-only Language Models
|
||||
|
||||
| Model | Architecture | Supported |
|
||||
|--------------------------------------|-------------------------------------------|-----------|
|
||||
| meta-llama/Llama-3.1 / 3.3 | LlamaForCausalLM | ✅ |
|
||||
| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ |
|
||||
| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ |
|
||||
| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ |
|
||||
| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ |
|
||||
| zai-org/GLM-4.5 | GLMForCausalLM | ✅ |
|
||||
| google/gemma | GemmaForCausalLM | ✅ |
|
||||
| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ |
|
||||
| meta-llama/Llama-3.2-3B-Instruct | LlamaForCausalLM | ✅ |
|
||||
| ibm-granite/granite-3.2-2b-instruct | GraniteForCausalLM | ✅ |
|
||||
| Qwen/Qwen3-1.7B | Qwen3ForCausalLM | ✅ |
|
||||
| Qwen/Qwen3-4B | Qwen3ForCausalLM | ✅ |
|
||||
| Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ |
|
||||
| zai-org/glm-4-9b-hf | GLMForCausalLM | ✅ |
|
||||
| google/gemma-7b | GemmaForCausalLM | ✅ |
|
||||
|
||||
### Multimodal Language Models
|
||||
|
||||
| Model | Architecture | Supported |
|
||||
|--------------------------------------|-------------------------------------------|-----------|
|
||||
| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ |
|
||||
| openai/whisper | WhisperForConditionalGeneration | ✅ |
|
||||
| Qwen/Qwen2.5-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ |
|
||||
| openai/whisper-large-v3 | WhisperForConditionalGeneration | ✅ |
|
||||
|
||||
✅ Runs and optimized.
|
||||
🟨 Runs and correct but not optimized to green yet.
|
||||
|
||||
65
docs/models/hardware_supported_models/xpu.md
Normal file
65
docs/models/hardware_supported_models/xpu.md
Normal file
@ -0,0 +1,65 @@
|
||||
# XPU - Intel® GPUs
|
||||
|
||||
## Validated Hardware
|
||||
|
||||
| Hardware |
|
||||
| ----------------------------------------- |
|
||||
| [Intel® Arc™ Pro B-Series Graphics](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/workstations/b-series/overview.html) |
|
||||
|
||||
## Supported Models
|
||||
|
||||
### Text-only Language Models
|
||||
|
||||
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
|
||||
| ----------------------------------------- | ---------------------------------------------------- | ---- | ----------- | ----- |
|
||||
| openai/gpt-oss-20b | GPTForCausalLM | | | ✅ |
|
||||
| openai/gpt-oss-120b | GPTForCausalLM | | | ✅ |
|
||||
| deepseek-ai/DeepSeek-R1-Distill-Llama-8B | LlamaForCausalLM | ✅ | ✅ | |
|
||||
| deepseek-ai/DeepSeek-R1-Distill-Qwen-14B | QwenForCausalLM | ✅ | ✅ | |
|
||||
| deepseek-ai/DeepSeek-R1-Distill-Qwen-32B | QwenForCausalLM | ✅ | ✅ | |
|
||||
| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | LlamaForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen2.5-72B-Instruct | Qwen2ForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen3-14B | Qwen3ForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen3-30B-A3B | Qwen3ForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen3-30B-A3B-GPTQ-Int4 | Qwen3ForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen3-coder-30B-A3B-Instruct | Qwen3ForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/QwQ-32B | QwenForCausalLM | ✅ | ✅ | |
|
||||
| deepseek-ai/DeepSeek-V2-Lite | DeepSeekForCausalLM | ✅ | ✅ | |
|
||||
| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | ✅ | |
|
||||
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
|
||||
| THUDM/GLM-4-9B-chat | GLMForCausalLM | ✅ | ✅ | |
|
||||
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
|
||||
| chuhac/TeleChat2-35B | LlamaForCausalLM (TeleChat2 based on Llama arch) | ✅ | ✅ | |
|
||||
| 01-ai/Yi1.5-34B-Chat | YiForCausalLM | ✅ | ✅ | |
|
||||
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
|
||||
| deepseek-ai/DeepSeek-Coder-33B-base | DeepSeekCoderForCausalLM | ✅ | ✅ | |
|
||||
| baichuan-inc/Baichuan2-13B-Chat | BaichuanForCausalLM | ✅ | ✅ | |
|
||||
| meta-llama/Llama-2-13b-chat-hf | LlamaForCausalLM | ✅ | ✅ | |
|
||||
| THUDM/CodeGeex4-All-9B | CodeGeexForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen1.5-14B-Chat | QwenForCausalLM | ✅ | ✅ | |
|
||||
| Qwen/Qwen1.5-32B-Chat | QwenForCausalLM | ✅ | ✅ | |
|
||||
|
||||
### Multimodal Language Models
|
||||
|
||||
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
|
||||
| ---------------------------- | -------------------------------- | ---- | ----------- | ----- |
|
||||
| OpenGVLab/InternVL3_5-8B | InternVLForConditionalGeneration | ✅ | ✅ | |
|
||||
| OpenGVLab/InternVL3_5-14B | InternVLForConditionalGeneration | ✅ | ✅ | |
|
||||
| OpenGVLab/InternVL3_5-38B | InternVLForConditionalGeneration | ✅ | ✅ | |
|
||||
| Qwen/Qwen2-VL-7B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
|
||||
| Qwen/Qwen2.5-VL-72B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
|
||||
| Qwen/Qwen2.5-VL-32B-Instruct | Qwen2VLForConditionalGeneration | ✅ | ✅ | |
|
||||
| THUDM/GLM-4v-9B | GLM4vForConditionalGeneration | ✅ | ✅ | |
|
||||
| openbmb/MiniCPM-V-4 | MiniCPMVForConditionalGeneration | ✅ | ✅ | |
|
||||
|
||||
### Embedding and Reranker Language Models
|
||||
|
||||
| Model | Architecture | FP16 | Dynamic FP8 | MXFP4 |
|
||||
| ----------------------- | ------------------------------ | ---- | ----------- | ----- |
|
||||
| Qwen/Qwen3-Embedding-8B | Qwen3ForTextEmbedding | ✅ | ✅ | |
|
||||
| Qwen/Qwen3-Reranker-8B | Qwen3ForSequenceClassification | ✅ | ✅ | |
|
||||
|
||||
✅ Runs and optimized.
|
||||
🟨 Runs and correct but not optimized to green yet.
|
||||
❌ Does not pass accuracy test or does not run.
|
||||
@ -1,15 +1,15 @@
|
||||
# Pooling Models
|
||||
|
||||
vLLM also supports pooling models, such as embedding, classification and reward models.
|
||||
vLLM also supports pooling models, such as embedding, classification, and reward models.
|
||||
|
||||
In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
|
||||
These models use a [Pooler][vllm.model_executor.layers.pooler.Pooler] to extract the final hidden states of the input
|
||||
before returning them.
|
||||
|
||||
!!! note
|
||||
We currently support pooling models primarily as a matter of convenience. This is not guaranteed to have any performance improvement over using HF Transformers / Sentence Transformers directly.
|
||||
We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly.
|
||||
|
||||
We are now planning to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
|
||||
We plan to optimize pooling models in vLLM. Please comment on <https://github.com/vllm-project/vllm/issues/21796> if you have any suggestions!
|
||||
|
||||
## Configuration
|
||||
|
||||
@ -19,7 +19,7 @@ Run a model in pooling mode via the option `--runner pooling`.
|
||||
|
||||
!!! tip
|
||||
There is no need to set this option in the vast majority of cases as vLLM can automatically
|
||||
detect the model runner to use via `--runner auto`.
|
||||
detect the appropriate model runner via `--runner auto`.
|
||||
|
||||
### Model Conversion
|
||||
|
||||
@ -78,7 +78,7 @@ When loading [Sentence Transformers](https://huggingface.co/sentence-transformer
|
||||
its Sentence Transformers configuration file (`modules.json`) takes priority over the model's defaults.
|
||||
|
||||
You can further customize this via the `--pooler-config` option,
|
||||
which takes priority over both the model's and Sentence Transformers's defaults.
|
||||
which takes priority over both the model's and Sentence Transformers' defaults.
|
||||
|
||||
## Offline Inference
|
||||
|
||||
@ -168,11 +168,11 @@ The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
|
||||
|
||||
- For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`.
|
||||
- For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`.
|
||||
- For similarity scores, use `LLM.score(...)`.
|
||||
- For similarity scores, use `LLM.score(...)`.
|
||||
- For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`.
|
||||
- For token classification, use `pooling_task="token_classify"`.
|
||||
- For multi-vector retrieval, use `pooling_task="token_embed"`
|
||||
- For IO Processor Plugins , use `pooling_task="plugin"`
|
||||
- For multi-vector retrieval, use `pooling_task="token_embed"`.
|
||||
- For IO Processor Plugins, use `pooling_task="plugin"`.
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -194,15 +194,15 @@ Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides
|
||||
- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
|
||||
|
||||
!!! note
|
||||
Please use one of the more specific methods or set the task directly when using [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.:
|
||||
Please use one of the more specific endpoints or set the task directly when using the [Pooling API](../serving/openai_compatible_server.md#pooling-api):
|
||||
|
||||
- For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`.
|
||||
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`.
|
||||
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
|
||||
- For rewards, `task":"token_classify"`.
|
||||
- For token classification, use `task":"token_classify"`.
|
||||
- For multi-vector retrieval, use `task":"token_embed"`
|
||||
- For IO Processor Plugins , use `task":"plugin"`
|
||||
- For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `"task":"classify"`.
|
||||
- For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api).
|
||||
- For rewards, use `"task":"token_classify"`.
|
||||
- For token classification, use `"task":"token_classify"`.
|
||||
- For multi-vector retrieval, use `"task":"token_embed"`.
|
||||
- For IO Processor Plugins, use `"task":"plugin"`.
|
||||
|
||||
```python
|
||||
# start a supported embeddings model server with `vllm serve`, e.g.
|
||||
@ -232,7 +232,7 @@ for output in response.json()["data"]:
|
||||
|
||||
## Matryoshka Embeddings
|
||||
|
||||
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost.
|
||||
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows users to trade off between performance and cost.
|
||||
|
||||
!!! warning
|
||||
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
|
||||
@ -245,9 +245,9 @@ for output in response.json()["data"]:
|
||||
|
||||
### Manually enable Matryoshka Embeddings
|
||||
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions.
|
||||
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json`, you can change the output dimension to arbitrary values. Use `matryoshka_dimensions` to control the allowed output dimensions.
|
||||
|
||||
For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline) or `--hf-overrides '{"is_matryoshka": true}'`, `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'`(online).
|
||||
For models that support Matryoshka Embeddings but are not recognized by vLLM, manually override the config using `hf_overrides={"is_matryoshka": True}` or `hf_overrides={"matryoshka_dimensions": [<allowed output dimensions>]}` (offline), or `--hf-overrides '{"is_matryoshka": true}'` or `--hf-overrides '{"matryoshka_dimensions": [<allowed output dimensions>]}'` (online).
|
||||
|
||||
Here is an example to serve a model with Matryoshka Embeddings enabled.
|
||||
|
||||
@ -278,7 +278,7 @@ A code example can be found here: [examples/offline_inference/pooling/embed_matr
|
||||
|
||||
### Online Inference
|
||||
|
||||
Use the following command to start vllm server.
|
||||
Use the following command to start the vLLM server.
|
||||
|
||||
```bash
|
||||
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
|
||||
@ -310,11 +310,11 @@ An OpenAI client example can be found here: [examples/online_serving/pooling/ope
|
||||
|
||||
### Encode task
|
||||
|
||||
We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`:
|
||||
We have split the `encode` task into two more specific token-wise tasks: `token_embed` and `token_classify`:
|
||||
|
||||
- `token_embed` is the same as embed, using normalize as activation.
|
||||
- `token_classify` is the same as classify, default using softmax as activation.
|
||||
- `token_embed` is the same as `embed`, using normalization as the activation.
|
||||
- `token_classify` is the same as `classify`, by default using softmax as the activation.
|
||||
|
||||
### Remove softmax from PoolingParams
|
||||
|
||||
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function.
|
||||
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
|
||||
|
||||
@ -680,6 +680,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
|
||||
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
@ -49,7 +49,8 @@ We currently support the following OpenAI APIs:
|
||||
- *Note: `suffix` parameter is not supported.*
|
||||
- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
|
||||
- Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
|
||||
- *Note: `parallel_tool_calls` and `user` parameters are ignored.*
|
||||
- *Note: `user` parameter is ignored.*
|
||||
- *Note:* Setting the `parallel_tool_calls` parameter to `false` ensures vLLM only returns zero or one tool call per request. Setting it to `true` (the default) allows returning more than one tool call per request. There is no guarantee more than one tool call will be returned if this is set to `true`, as that behavior is model dependent and not all models are designed to support parallel tool calls.
|
||||
- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
|
||||
- Only applicable to [embedding models](../models/pooling_models.md).
|
||||
- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
|
||||
|
||||
15
examples/offline_inference/audio_language.py
Normal file → Executable file
15
examples/offline_inference/audio_language.py
Normal file → Executable file
@ -425,6 +425,13 @@ def parse_args():
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"-tp",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Tensor parallel size to override the model's default setting. ",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
@ -434,6 +441,12 @@ def main(args):
|
||||
if model not in model_example_map:
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
|
||||
raise ValueError(
|
||||
f"tensor_parallel_size must be a positive integer, "
|
||||
f"got {args.tensor_parallel_size}"
|
||||
)
|
||||
|
||||
audio_count = args.num_audios
|
||||
req_data = model_example_map[model](
|
||||
question_per_audio_count[audio_count], audio_count
|
||||
@ -446,6 +459,8 @@ def main(args):
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
if args.tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
|
||||
170
examples/offline_inference/qwen3_omni/only_thinker.py
Normal file
170
examples/offline_inference/qwen3_omni/only_thinker.py
Normal file
@ -0,0 +1,170 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference
|
||||
with the correct prompt format on Qwen2.5-Omni (thinker only).
|
||||
"""
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class QueryResult(NamedTuple):
|
||||
inputs: dict
|
||||
limit_mm_per_prompt: dict[str, int]
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
default_system = (
|
||||
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
|
||||
"Group, capable of perceiving auditory and visual inputs, as well as "
|
||||
"generating text and speech."
|
||||
)
|
||||
|
||||
|
||||
def get_mixed_modalities_query() -> QueryResult:
|
||||
question = (
|
||||
"What is recited in the audio? "
|
||||
"What is the content of this image? Why is this video funny?"
|
||||
)
|
||||
prompt = (
|
||||
f"<|im_start|>system\n{default_system}<|im_end|>\n"
|
||||
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
|
||||
"<|vision_start|><|image_pad|><|vision_end|>"
|
||||
"<|vision_start|><|video_pad|><|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
f"<|im_start|>assistant\n"
|
||||
)
|
||||
return QueryResult(
|
||||
inputs={
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
|
||||
"image": convert_image_mode(
|
||||
ImageAsset("cherry_blossom").pil_image, "RGB"
|
||||
),
|
||||
"video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
|
||||
},
|
||||
},
|
||||
limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
|
||||
)
|
||||
|
||||
|
||||
def get_use_audio_in_video_query() -> QueryResult:
|
||||
question = (
|
||||
"Describe the content of the video in details, then convert what the "
|
||||
"baby say into text."
|
||||
)
|
||||
prompt = (
|
||||
f"<|im_start|>system\n{default_system}<|im_end|>\n"
|
||||
"<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
f"<|im_start|>assistant\n"
|
||||
)
|
||||
asset = VideoAsset(name="baby_reading", num_frames=16)
|
||||
audio = asset.get_audio(sampling_rate=16000)
|
||||
return QueryResult(
|
||||
inputs={
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"video": asset.np_ndarrays,
|
||||
"audio": audio,
|
||||
},
|
||||
"mm_processor_kwargs": {
|
||||
"use_audio_in_video": True,
|
||||
},
|
||||
},
|
||||
limit_mm_per_prompt={"audio": 1, "video": 1},
|
||||
)
|
||||
|
||||
|
||||
def get_multi_audios_query() -> QueryResult:
|
||||
question = "Are these two audio clips the same?"
|
||||
prompt = (
|
||||
f"<|im_start|>system\n{default_system}<|im_end|>\n"
|
||||
"<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
|
||||
"<|audio_start|><|audio_pad|><|audio_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
f"<|im_start|>assistant\n"
|
||||
)
|
||||
return QueryResult(
|
||||
inputs={
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
"audio": [
|
||||
AudioAsset("winning_call").audio_and_sample_rate,
|
||||
AudioAsset("mary_had_lamb").audio_and_sample_rate,
|
||||
],
|
||||
},
|
||||
},
|
||||
limit_mm_per_prompt={
|
||||
"audio": 2,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
query_map = {
|
||||
"mixed_modalities": get_mixed_modalities_query,
|
||||
"use_audio_in_video": get_use_audio_in_video_query,
|
||||
"multi_audios": get_multi_audios_query,
|
||||
}
|
||||
|
||||
|
||||
def main(args):
|
||||
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
|
||||
query_result = query_map[args.query_type]()
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=12800,
|
||||
max_num_seqs=5,
|
||||
limit_mm_per_prompt=query_result.limit_mm_per_prompt,
|
||||
seed=args.seed,
|
||||
)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
|
||||
|
||||
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Demo on using vLLM for offline inference with "
|
||||
"audio language models"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query-type",
|
||||
"-q",
|
||||
type=str,
|
||||
default="mixed_modalities",
|
||||
choices=query_map.keys(),
|
||||
help="Query type.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@ -133,7 +133,7 @@ def main(args):
|
||||
tensor_parallel_size=args.tp,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
enforce_eager=args.enforce_eager,
|
||||
gpu_memory_utilization=0.8,
|
||||
gpu_memory_utilization=0.9,
|
||||
speculative_config=speculative_config,
|
||||
disable_log_stats=False,
|
||||
max_model_len=args.max_model_len,
|
||||
|
||||
41
examples/offline_inference/vision_language.py
Normal file → Executable file
41
examples/offline_inference/vision_language.py
Normal file → Executable file
@ -538,6 +538,31 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# HunyuanOCR
|
||||
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "tencent/HunyuanOCR"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
|
||||
placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>" # noqa: E501
|
||||
prompts = [
|
||||
f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
stop_token_ids=None,
|
||||
)
|
||||
|
||||
|
||||
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
|
||||
def run_hyperclovax_seed_vision(
|
||||
questions: list[str], modality: str
|
||||
@ -1820,6 +1845,7 @@ model_example_map = {
|
||||
"glm4_5v": run_glm4_5v,
|
||||
"glm4_5v_fp8": run_glm4_5v_fp8,
|
||||
"h2ovl_chat": run_h2ovl,
|
||||
"hunyuan_vl": run_hunyuan_vl,
|
||||
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
|
||||
"idefics3": run_idefics3,
|
||||
"interns1": run_interns1,
|
||||
@ -2038,6 +2064,13 @@ def parse_args():
|
||||
help="If True, will send all requests in a second batch with empty mm "
|
||||
"data to verify cache hits with UUIDs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"-tp",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Tensor parallel size to override the model's default setting. ",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -2046,6 +2079,12 @@ def main(args):
|
||||
if model not in model_example_map:
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
|
||||
raise ValueError(
|
||||
f"tensor_parallel_size must be a positive integer, "
|
||||
f"got {args.tensor_parallel_size}"
|
||||
)
|
||||
|
||||
modality = args.modality
|
||||
mm_input = get_multi_modal_input(args)
|
||||
data = mm_input["data"]
|
||||
@ -2063,6 +2102,8 @@ def main(args):
|
||||
"seed": args.seed,
|
||||
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
|
||||
}
|
||||
if args.tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# Don't want to check the flag multiple times, so just hijack `prompts`.
|
||||
|
||||
41
examples/offline_inference/vision_language_multi_image.py
Normal file → Executable file
41
examples/offline_inference/vision_language_multi_image.py
Normal file → Executable file
@ -1110,6 +1110,7 @@ def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model=model_name,
|
||||
max_model_len=16384,
|
||||
max_num_seqs=16,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
@ -1351,10 +1352,18 @@ model_example_map = {
|
||||
}
|
||||
|
||||
|
||||
def run_generate(model, question: str, image_urls: list[str], seed: int | None):
|
||||
def run_generate(
|
||||
model,
|
||||
question: str,
|
||||
image_urls: list[str],
|
||||
seed: int | None,
|
||||
tensor_parallel_size: int | None,
|
||||
):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
if tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
@ -1377,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
|
||||
def run_chat(
|
||||
model: str,
|
||||
question: str,
|
||||
image_urls: list[str],
|
||||
seed: int | None,
|
||||
tensor_parallel_size: int | None,
|
||||
):
|
||||
req_data = model_example_map[model](question, image_urls)
|
||||
|
||||
# Disable other modalities to save memory
|
||||
@ -1387,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
|
||||
)
|
||||
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": seed}
|
||||
if tensor_parallel_size is not None:
|
||||
engine_args["tensor_parallel_size"] = tensor_parallel_size
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
sampling_params = (
|
||||
@ -1462,6 +1479,13 @@ def parse_args():
|
||||
default=2,
|
||||
help="Number of images to use for the demo.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tensor-parallel-size",
|
||||
"-tp",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Tensor parallel size to override the model's default setting. ",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -1469,13 +1493,20 @@ def main(args: Namespace):
|
||||
model = args.model_type
|
||||
method = args.method
|
||||
seed = args.seed
|
||||
tensor_parallel_size = args.tensor_parallel_size
|
||||
|
||||
if tensor_parallel_size is not None and tensor_parallel_size < 1:
|
||||
raise ValueError(
|
||||
f"tensor_parallel_size must be a positive integer, "
|
||||
f"got {tensor_parallel_size}"
|
||||
)
|
||||
|
||||
image_urls = IMAGE_URLS[: args.num_images]
|
||||
|
||||
if method == "generate":
|
||||
run_generate(model, QUESTION, image_urls, seed)
|
||||
run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
|
||||
elif method == "chat":
|
||||
run_chat(model, QUESTION, image_urls, seed)
|
||||
run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
|
||||
else:
|
||||
raise ValueError(f"Invalid method: {method}")
|
||||
|
||||
|
||||
@ -22,7 +22,6 @@ API_KEY=${API_KEY:-"your-api-key"}
|
||||
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
|
||||
export VLLM_ENABLE_CHUNKED_PROCESSING=true
|
||||
export CUDA_VISIBLE_DEVICES=2,3,4,5
|
||||
# export VLLM_ATTENTION_BACKEND=XFORMERS
|
||||
|
||||
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
|
||||
echo "=================================================================="
|
||||
|
||||
@ -4,9 +4,9 @@ packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
setuptools-scm>=8
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.0; platform_system == "Darwin"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
|
||||
@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
|
||||
torch==2.9.0; platform_system == "Darwin"
|
||||
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||
|
||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchaudio==2.8.0; platform_machine == "ppc64le"
|
||||
torchaudio==2.9.0; platform_machine == "ppc64le"
|
||||
|
||||
# required for the image processor of phi3v, this must be updated alongside torch
|
||||
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchvision==0.23.0; platform_machine == "ppc64le"
|
||||
torchvision==0.24.0; platform_machine == "ppc64le"
|
||||
datasets # for benchmark scripts
|
||||
|
||||
# Intel Extension for PyTorch, only for x86_64 CPUs
|
||||
|
||||
@ -9,6 +9,5 @@ torch==2.9.0
|
||||
torchaudio==2.9.0
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.5.2
|
||||
flashinfer-python==0.5.3
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
lmcache
|
||||
nixl >= 0.6.0 # Required for disaggregated prefill
|
||||
nixl >= 0.7.1 # Required for disaggregated prefill
|
||||
|
||||
@ -29,7 +29,7 @@ opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
mteb>=1.38.11, <2 # required for mteb test
|
||||
transformers==4.57.1
|
||||
transformers==4.57.3
|
||||
tokenizers==0.22.0
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
|
||||
@ -45,3 +45,7 @@ multiprocess==0.70.16
|
||||
|
||||
# Plugins test
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
|
||||
torchgeo==0.7.0
|
||||
|
||||
# Required for suffix decoding test
|
||||
arctic-inference == 0.1.1
|
||||
|
||||
@ -37,7 +37,7 @@ datamodel_code_generator # required for minicpm3 test
|
||||
# TODO: Use lm-eval[api]==0.4.10 once released
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
|
||||
mteb[bm25s]>=2, <3 # required for mteb test
|
||||
transformers==4.57.1
|
||||
transformers==4.57.3
|
||||
tokenizers==0.22.0
|
||||
schemathesis>=3.39.15 # Required for openai schema test.
|
||||
# quantization
|
||||
|
||||
@ -1196,7 +1196,7 @@ tqdm==4.66.6
|
||||
# transformers
|
||||
tqdm-multiprocess==0.0.11
|
||||
# via lm-eval
|
||||
transformers==4.57.1
|
||||
transformers==4.57.3
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# genai-perf
|
||||
|
||||
@ -12,6 +12,4 @@ ray[data]
|
||||
setuptools==78.1.0
|
||||
nixl==0.3.0
|
||||
tpu_info==0.4.0
|
||||
|
||||
# Install torch_xla
|
||||
torch_xla[tpu, pallas]==2.8.0
|
||||
tpu-inference==0.11.1
|
||||
|
||||
@ -10,9 +10,9 @@ wheel
|
||||
jinja2>=3.1.6
|
||||
datasets # for benchmark scripts
|
||||
numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
torch==2.8.0+xpu
|
||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
torch==2.9.0+xpu
|
||||
torchaudio
|
||||
torchvision
|
||||
--extra-index-url=https://download.pytorch.org/whl/xpu
|
||||
|
||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||
intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
|
||||
|
||||
@ -74,9 +74,6 @@ def test_models(
|
||||
model_executor: str,
|
||||
enable_prompt_embeds: bool,
|
||||
) -> None:
|
||||
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
|
||||
pytest.skip(f"{backend} does not support gemma2 with full context length.")
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ class SillyModel(nn.Module):
|
||||
def _run_simple_model(
|
||||
splitting_ops,
|
||||
use_inductor_graph_partition,
|
||||
use_inductor,
|
||||
backend,
|
||||
expected_num_piecewise_graphs_seen,
|
||||
expected_num_piecewise_capturable_graphs_seen,
|
||||
expected_num_backend_compilations,
|
||||
@ -64,7 +64,7 @@ def _run_simple_model(
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
use_inductor=use_inductor,
|
||||
backend=backend,
|
||||
splitting_ops=splitting_ops,
|
||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||
cudagraph_copy_inputs=True,
|
||||
@ -124,14 +124,14 @@ def _run_simple_model(
|
||||
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
||||
@torch.inference_mode()
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
def test_simple_piecewise_compile(backend):
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly::attention"],
|
||||
use_inductor_graph_partition=False,
|
||||
use_inductor=use_inductor,
|
||||
backend=backend,
|
||||
# 2 * num_layers + 1
|
||||
expected_num_piecewise_graphs_seen=5,
|
||||
# 1 + num_layers
|
||||
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly::attention"],
|
||||
use_inductor_graph_partition=True,
|
||||
use_inductor=True,
|
||||
backend="inductor",
|
||||
# Since not splitting at fx graph level
|
||||
expected_num_piecewise_graphs_seen=1,
|
||||
# Since not splitting at fx graph level
|
||||
|
||||
@ -172,8 +172,8 @@ def test_splitting_ops_dynamic():
|
||||
config = VllmConfig()
|
||||
# Default V1 config leaves cudagraph mode unset; splitting ops are only
|
||||
# populated when the engine decides to use piecewise compilation.
|
||||
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
|
||||
assert not config.compilation_config.splitting_ops_contain_attention()
|
||||
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE
|
||||
assert config.compilation_config.splitting_ops_contain_attention()
|
||||
|
||||
# When use_inductor_graph_partition=True
|
||||
config = VllmConfig(
|
||||
|
||||
88
tests/compile/test_dynamic_shapes_compilation.py
Normal file
88
tests/compile/test_dynamic_shapes_compilation.py
Normal file
@ -0,0 +1,88 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import gc
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config.compilation import CompilationMode, DynamicShapesType
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||
|
||||
|
||||
def get_test_models():
|
||||
"""Get list of models to test based on PyTorch version"""
|
||||
# TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
|
||||
return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", get_test_models())
|
||||
@pytest.mark.parametrize(
|
||||
"shapes_type",
|
||||
[
|
||||
DynamicShapesType.BACKED,
|
||||
DynamicShapesType.UNBACKED,
|
||||
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("use_aot_compile", ["0"])
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_dynamic_shapes_compilation(
|
||||
monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
|
||||
):
|
||||
"""Test that all dynamic shapes types compile successfully"""
|
||||
print(
|
||||
f"\nTesting model: {model_name} with {shapes_type.name}, "
|
||||
f"AOT compile: {use_aot_compile}, "
|
||||
f"Bytecode hook: {use_bytecode_hook}"
|
||||
)
|
||||
if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
|
||||
pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
|
||||
monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
|
||||
|
||||
prompt = "Hello, my name is"
|
||||
|
||||
print(f"Testing {shapes_type.name} dynamic shapes...")
|
||||
|
||||
# Initialize the model with specific dynamic shapes configuration
|
||||
model = LLM(
|
||||
model=model_name,
|
||||
compilation_config={
|
||||
"mode": CompilationMode.VLLM_COMPILE,
|
||||
"dynamic_shapes_config": {
|
||||
"type": shapes_type.value,
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
output = model.generate(prompt)
|
||||
result = output[0].outputs[0].text
|
||||
# Example of setting the sampling parameters
|
||||
tokenizer = get_tokenizer(model_name)
|
||||
yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
|
||||
no_tokens = tokenizer.encode("no", add_special_tokens=False)
|
||||
allowed_ids = list(set(yes_tokens + no_tokens))
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=1, temperature=0, allowed_token_ids=allowed_ids
|
||||
)
|
||||
|
||||
output = model.generate(
|
||||
"answer with yes or no is " + result + " rubbish for prompt " + prompt + "?",
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
result = output[0].outputs[0].text
|
||||
assert result == "yes"
|
||||
|
||||
# Clean up GPU memory
|
||||
del model
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.synchronize()
|
||||
print("GPU memory cleared")
|
||||
@ -9,8 +9,9 @@ from tests.compile.backend import LazyInitPass, TestBackend
|
||||
from tests.utils import flat_product
|
||||
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
from vllm.attention import Attention, AttentionMetadata
|
||||
from vllm.attention.backends.abstract import AttentionMetadata
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.attention.selector import global_force_attn_backend_context_manager
|
||||
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
|
||||
from vllm.compilation.fx_utils import find_op_nodes
|
||||
|
||||
@ -5,7 +5,8 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.compile.backend import TestBackend
|
||||
from vllm.attention import Attention, AttentionType
|
||||
from vllm.attention.backends.abstract import AttentionType
|
||||
from vllm.attention.layer import Attention
|
||||
from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
|
||||
from vllm.compilation.noop_elimination import NoOpEliminationPass
|
||||
from vllm.compilation.post_cleanup import PostCleanupPass
|
||||
|
||||
@ -1,13 +1,18 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
|
||||
from vllm.distributed.eplb.rebalance_execute import (
|
||||
move_from_buffer,
|
||||
rearrange_expert_weights_inplace,
|
||||
transfer_layer,
|
||||
)
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
@ -231,6 +236,100 @@ def verify_redundant_experts_have_same_weights(
|
||||
)
|
||||
|
||||
|
||||
def _test_async_transfer_layer_without_mtp_worker(
|
||||
env,
|
||||
world_size: int,
|
||||
num_layers: int,
|
||||
num_local_experts: int,
|
||||
num_logical_experts: int,
|
||||
) -> None:
|
||||
set_env_vars_and_device(env)
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
tp_group = get_tp_group()
|
||||
ep_group = tp_group.device_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
hidden_sizes = [16, 32]
|
||||
|
||||
redundancy_config = create_redundancy_config(
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
)
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
redundancy_config,
|
||||
)
|
||||
|
||||
new_redundancy_config = create_redundancy_config(
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
new_redundancy_config,
|
||||
)
|
||||
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers,
|
||||
num_local_experts,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
device,
|
||||
old_indices,
|
||||
)
|
||||
|
||||
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
|
||||
cuda_stream = torch.cuda.Stream(device=device)
|
||||
|
||||
for layer_idx in range(num_layers):
|
||||
is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
|
||||
transfer_layer(
|
||||
old_global_expert_indices=old_indices,
|
||||
new_global_expert_indices=new_indices,
|
||||
expert_weights=expert_weights,
|
||||
expert_weights_buffer=expert_buffer,
|
||||
ep_group=ep_group,
|
||||
layer=layer_idx,
|
||||
cuda_stream=cuda_stream,
|
||||
)
|
||||
)
|
||||
|
||||
cuda_stream.synchronize()
|
||||
move_from_buffer(
|
||||
expert_weights=expert_weights[layer_idx],
|
||||
expert_weights_buffer=expert_buffer,
|
||||
is_unchanged=is_unchanged,
|
||||
is_received_locally=is_received_locally,
|
||||
experts_recv_loc=experts_recv_loc,
|
||||
new_indices=new_indices[layer_idx].tolist(),
|
||||
ep_group=ep_group,
|
||||
)
|
||||
|
||||
verify_expert_weights_after_shuffle(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
num_local_experts,
|
||||
)
|
||||
verify_redundant_experts_have_same_weights(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
world_size,
|
||||
num_local_experts,
|
||||
)
|
||||
|
||||
|
||||
def _test_rearrange_expert_weights_with_redundancy(
|
||||
env, world_size, num_layers, num_local_experts, num_logical_experts
|
||||
) -> None:
|
||||
@ -399,6 +498,32 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"world_size,num_layers,num_local_experts,num_logical_experts",
|
||||
[
|
||||
(2, 2, 2, 3),
|
||||
],
|
||||
)
|
||||
def test_async_transfer_layer_without_mtp(
|
||||
world_size: int,
|
||||
num_layers: int,
|
||||
num_local_experts: int,
|
||||
num_logical_experts: int,
|
||||
):
|
||||
"""Exercise async EPLB transfer path without MTP/spec decode."""
|
||||
|
||||
if torch.cuda.device_count() < world_size:
|
||||
pytest.skip(f"Need at least {world_size} GPUs to run the test")
|
||||
|
||||
distributed_run(
|
||||
_test_async_transfer_layer_without_mtp_worker,
|
||||
world_size,
|
||||
num_layers,
|
||||
num_local_experts,
|
||||
num_logical_experts,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2, 4])
|
||||
def test_rearrange_expert_weights_no_change(world_size):
|
||||
"""
|
||||
|
||||
@ -10,10 +10,11 @@ from tests.utils import large_gpu_mark
|
||||
|
||||
def get_model_args(
|
||||
model_name: str,
|
||||
spec_model_name: str,
|
||||
spec_model_name: str | None,
|
||||
spec_method: str,
|
||||
tp_size: int,
|
||||
model_max_len: int,
|
||||
use_async: bool = False,
|
||||
) -> dict:
|
||||
speculative_config = {
|
||||
"method": spec_method,
|
||||
@ -37,6 +38,8 @@ def get_model_args(
|
||||
"enable_eplb": True,
|
||||
"max_model_len": model_max_len,
|
||||
}
|
||||
if use_async:
|
||||
model_args["eplb_config"] = {"use_async": True}
|
||||
return model_args
|
||||
|
||||
|
||||
@ -94,3 +97,37 @@ def test_eplb_spec_decode(
|
||||
measured_value - RTOL < expected_gsm8k_value
|
||||
and measured_value + RTOL > expected_gsm8k_value
|
||||
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
|
||||
|
||||
|
||||
@large_gpu_mark(min_gb=80)
|
||||
def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
|
||||
"""
|
||||
Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
|
||||
"""
|
||||
|
||||
TASK = "gsm8k"
|
||||
FILTER = "exact_match,strict-match"
|
||||
RTOL = 0.03
|
||||
expected_gsm8k_value = 0.86
|
||||
|
||||
model_args = get_model_args(
|
||||
model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
|
||||
spec_model_name=None,
|
||||
spec_method="mtp",
|
||||
tp_size=4,
|
||||
model_max_len=4096,
|
||||
use_async=True,
|
||||
)
|
||||
|
||||
results = lm_eval.simple_evaluate(
|
||||
model="vllm",
|
||||
model_args=model_args,
|
||||
tasks=TASK,
|
||||
batch_size=64,
|
||||
num_fewshot=8,
|
||||
)
|
||||
measured_value = results["results"][TASK][FILTER]
|
||||
assert (
|
||||
measured_value - RTOL < expected_gsm8k_value
|
||||
and measured_value + RTOL > expected_gsm8k_value
|
||||
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
|
||||
|
||||
@ -222,6 +222,47 @@ def test_media_io_kwargs_parser(arg, expected):
|
||||
assert args.media_io_kwargs == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("args", "expected"),
|
||||
[
|
||||
(["-O", "1"], "1"),
|
||||
(["-O", "2"], "2"),
|
||||
(["-O", "3"], "3"),
|
||||
(["-O0"], "0"),
|
||||
(["-O1"], "1"),
|
||||
(["-O2"], "2"),
|
||||
(["-O3"], "3"),
|
||||
],
|
||||
)
|
||||
def test_optimization_level(args, expected):
|
||||
"""
|
||||
Test space-separated optimization levels (-O 1, -O 2, -O 3) map to
|
||||
optimization_level.
|
||||
"""
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
parsed_args = parser.parse_args(args)
|
||||
assert parsed_args.optimization_level == expected
|
||||
assert parsed_args.compilation_config.mode is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("args", "expected"),
|
||||
[
|
||||
(["-O.mode=0"], 0),
|
||||
(["-O.mode=1"], 1),
|
||||
(["-O.mode=2"], 2),
|
||||
(["-O.mode=3"], 3),
|
||||
],
|
||||
)
|
||||
def test_mode_parser(args, expected):
|
||||
"""
|
||||
Test compilation config modes (-O.mode=int) map to compilation_config.
|
||||
"""
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
parsed_args = parser.parse_args(args)
|
||||
assert parsed_args.compilation_config.mode == expected
|
||||
|
||||
|
||||
def test_compilation_config():
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
|
||||
@ -229,34 +270,17 @@ def test_compilation_config():
|
||||
args = parser.parse_args([])
|
||||
assert args.compilation_config == CompilationConfig()
|
||||
|
||||
# set to O3
|
||||
args = parser.parse_args(["-O0"])
|
||||
assert args.compilation_config.mode == 0
|
||||
|
||||
# set to O 3 (space)
|
||||
args = parser.parse_args(["-O", "1"])
|
||||
assert args.compilation_config.mode == 1
|
||||
|
||||
# set to O 3 (equals)
|
||||
args = parser.parse_args(["-O=2"])
|
||||
assert args.compilation_config.mode == 2
|
||||
|
||||
# set to O.mode 3
|
||||
args = parser.parse_args(["-O.mode", "3"])
|
||||
assert args.compilation_config.mode == 3
|
||||
|
||||
# set to string form of a dict
|
||||
args = parser.parse_args(
|
||||
[
|
||||
"-O",
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'"use_inductor": false}',
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
|
||||
]
|
||||
)
|
||||
assert (
|
||||
args.compilation_config.mode == 3
|
||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||
and not args.compilation_config.use_inductor
|
||||
and args.compilation_config.backend == "eager"
|
||||
)
|
||||
|
||||
# set to string form of a dict
|
||||
@ -264,13 +288,13 @@ def test_compilation_config():
|
||||
[
|
||||
"--compilation-config="
|
||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||
'"use_inductor": true}',
|
||||
'"backend": "inductor"}',
|
||||
]
|
||||
)
|
||||
assert (
|
||||
args.compilation_config.mode == 3
|
||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||
and args.compilation_config.use_inductor
|
||||
and args.compilation_config.backend == "inductor"
|
||||
)
|
||||
|
||||
|
||||
@ -278,8 +302,9 @@ def test_prefix_cache_default():
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
args = parser.parse_args([])
|
||||
|
||||
# should be None by default (depends on model).
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert engine_args.enable_prefix_caching, "prefix caching should default to on."
|
||||
assert engine_args.enable_prefix_caching is None
|
||||
|
||||
# with flag to turn it on.
|
||||
args = parser.parse_args(["--enable-prefix-caching"])
|
||||
|
||||
@ -183,9 +183,6 @@ async def test_metrics_counts(
|
||||
EXPECTED_METRICS_V1 = [
|
||||
"vllm:num_requests_running",
|
||||
"vllm:num_requests_waiting",
|
||||
"vllm:gpu_cache_usage_perc",
|
||||
"vllm:gpu_prefix_cache_queries",
|
||||
"vllm:gpu_prefix_cache_hits",
|
||||
"vllm:kv_cache_usage_perc",
|
||||
"vllm:prefix_cache_queries",
|
||||
"vllm:prefix_cache_hits",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import importlib
|
||||
import json
|
||||
import time
|
||||
|
||||
@ -35,6 +35,10 @@ GET_WEATHER_SCHEMA = {
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
assert importlib.util.find_spec("gpt_oss") is not None, (
|
||||
"Harmony tests require gpt_oss package to be installed"
|
||||
)
|
||||
|
||||
args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
|
||||
env_dict = dict(
|
||||
VLLM_ENABLE_RESPONSES_API_STORE="1",
|
||||
|
||||
@ -2,20 +2,12 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for structured outputs tests
|
||||
import io
|
||||
import json
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
@ -26,22 +18,8 @@ MISTRAL_FORMAT_ARGS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name", ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"]
|
||||
)
|
||||
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
|
||||
async def test_basic_audio(mary_had_lamb, model_name):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
@ -120,176 +98,3 @@ async def test_basic_audio_gemma(foscolo):
|
||||
)
|
||||
out = json.loads(transcription)["text"]
|
||||
assert "da cui vergine nacque Venere" in out
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_non_asr_model(winning_call):
|
||||
# text to text model
|
||||
model_name = "JackFram/llama-68m"
|
||||
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
res = await client.audio.transcriptions.create(
|
||||
model=model_name, file=winning_call, language="en", temperature=0.0
|
||||
)
|
||||
err = res.error
|
||||
assert err["code"] == 400 and not res.text
|
||||
assert err["message"] == "The model does not support Transcriptions API"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bad_requests(mary_had_lamb, client):
|
||||
# invalid language
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_long_audio_request(mary_had_lamb, client):
|
||||
mary_had_lamb.seek(0)
|
||||
audio, sr = librosa.load(mary_had_lamb)
|
||||
# Add small silence after each audio for repeatability in the split process
|
||||
audio = np.pad(audio, (0, 1600))
|
||||
repeated_audio = np.tile(audio, 10)
|
||||
# Repeated audio to buffer
|
||||
buffer = io.BytesIO()
|
||||
sf.write(buffer, repeated_audio, sr, format="WAV")
|
||||
buffer.seek(0)
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=buffer,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
out_usage = out["usage"]
|
||||
counts = out_text.count("Mary had a little lamb")
|
||||
assert counts == 10, counts
|
||||
assert out_usage["seconds"] == 161, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_endpoints(client):
|
||||
# text to text model
|
||||
res = await client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{"role": "system", "content": "You are a helpful assistant."}],
|
||||
)
|
||||
err = res.error
|
||||
assert err["code"] == 400
|
||||
assert err["message"] == "The model does not support Chat Completions API"
|
||||
|
||||
res = await client.completions.create(model=MODEL_NAME, prompt="Hello")
|
||||
err = res.error
|
||||
assert err["code"] == 400
|
||||
assert err["message"] == "The model does not support Completions API"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_response(winning_call, client):
|
||||
transcription = ""
|
||||
res_no_stream = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
response_format="json",
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
)
|
||||
res = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
timeout=30,
|
||||
)
|
||||
# Reconstruct from chunks and validate
|
||||
async for chunk in res:
|
||||
text = chunk.choices[0]["delta"]["content"]
|
||||
transcription += text
|
||||
|
||||
assert transcription == res_no_stream.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_options(winning_call, client):
|
||||
res = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
|
||||
timeout=30,
|
||||
)
|
||||
final = False
|
||||
continuous = True
|
||||
async for chunk in res:
|
||||
if not len(chunk.choices):
|
||||
# final usage sent
|
||||
final = True
|
||||
else:
|
||||
continuous = continuous and hasattr(chunk, "usage")
|
||||
assert final and continuous
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sampling_params(mary_had_lamb, client):
|
||||
"""
|
||||
Compare sampling with params and greedy sampling to assert results
|
||||
are different when extreme sampling parameters values are picked.
|
||||
"""
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
temperature=0.8,
|
||||
extra_body=dict(
|
||||
seed=42,
|
||||
repetition_penalty=1.9,
|
||||
top_k=12,
|
||||
top_p=0.4,
|
||||
min_p=0.5,
|
||||
frequency_penalty=1.8,
|
||||
presence_penalty=2.0,
|
||||
),
|
||||
)
|
||||
|
||||
greedy_transcription = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
extra_body=dict(seed=42),
|
||||
)
|
||||
|
||||
assert greedy_transcription.text != transcription.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_audio_prompt(mary_had_lamb, client):
|
||||
prompt = "This is a speech, recorded in a phonograph."
|
||||
# Prompts should not omit the part of original prompt while transcribing.
|
||||
prefix = "The first words I spoke in the original phonograph"
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
out = json.loads(transcription)["text"]
|
||||
assert prefix in out
|
||||
transcription_wprompt = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
prompt=prompt,
|
||||
temperature=0.0,
|
||||
)
|
||||
out_prompt = json.loads(transcription_wprompt)["text"]
|
||||
assert prefix in out_prompt
|
||||
|
||||
@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# imports for structured outputs tests
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import soundfile as sf
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "openai/whisper-large-v3-turbo"
|
||||
SERVER_ARGS = ["--enforce-eager"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def whisper_client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio(mary_had_lamb):
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
out_usage = out["usage"]
|
||||
assert "Mary had a little lamb," in out_text
|
||||
assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
|
||||
transcription = whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
transcription2 = whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
# Await both transcriptions by scheduling coroutines together
|
||||
transcription, transcription2 = await asyncio.gather(transcription, transcription2)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
assert "Mary had a little lamb," in out_text
|
||||
out2 = json.loads(transcription2)
|
||||
out_text2 = out2["text"]
|
||||
assert "Edgar Martinez" in out_text2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bad_requests(mary_had_lamb, whisper_client):
|
||||
# invalid language
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_long_audio_request(mary_had_lamb, whisper_client):
|
||||
mary_had_lamb.seek(0)
|
||||
audio, sr = librosa.load(mary_had_lamb)
|
||||
# Add small silence after each audio for repeatability in the split process
|
||||
audio = np.pad(audio, (0, 1600))
|
||||
repeated_audio = np.tile(audio, 10)
|
||||
# Repeated audio to buffer
|
||||
buffer = io.BytesIO()
|
||||
sf.write(buffer, repeated_audio, sr, format="WAV")
|
||||
buffer.seek(0)
|
||||
transcription = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=buffer,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
out_usage = out["usage"]
|
||||
counts = out_text.count("Mary had a little lamb")
|
||||
assert counts == 10, counts
|
||||
assert out_usage["seconds"] == 161, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_completion_endpoints(whisper_client):
|
||||
# text to text model
|
||||
res = await whisper_client.chat.completions.create(
|
||||
model=MODEL_NAME,
|
||||
messages=[{"role": "system", "content": "You are a helpful assistant."}],
|
||||
)
|
||||
err = res.error
|
||||
assert err["code"] == 400
|
||||
assert err["message"] == "The model does not support Chat Completions API"
|
||||
|
||||
res = await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
|
||||
err = res.error
|
||||
assert err["code"] == 400
|
||||
assert err["message"] == "The model does not support Completions API"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_response(winning_call, whisper_client):
|
||||
transcription = ""
|
||||
res_no_stream = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
response_format="json",
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
)
|
||||
res = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
timeout=30,
|
||||
)
|
||||
# Reconstruct from chunks and validate
|
||||
async for chunk in res:
|
||||
text = chunk.choices[0]["delta"]["content"]
|
||||
transcription += text
|
||||
|
||||
assert transcription == res_no_stream.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_stream_options(winning_call, whisper_client):
|
||||
res = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=winning_call,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
stream=True,
|
||||
extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
|
||||
timeout=30,
|
||||
)
|
||||
final = False
|
||||
continuous = True
|
||||
async for chunk in res:
|
||||
if not len(chunk.choices):
|
||||
# final usage sent
|
||||
final = True
|
||||
else:
|
||||
continuous = continuous and hasattr(chunk, "usage")
|
||||
assert final and continuous
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_sampling_params(mary_had_lamb, whisper_client):
|
||||
"""
|
||||
Compare sampling with params and greedy sampling to assert results
|
||||
are different when extreme sampling parameters values are picked.
|
||||
"""
|
||||
transcription = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
temperature=0.8,
|
||||
extra_body=dict(
|
||||
seed=42,
|
||||
repetition_penalty=1.9,
|
||||
top_k=12,
|
||||
top_p=0.4,
|
||||
min_p=0.5,
|
||||
frequency_penalty=1.8,
|
||||
presence_penalty=2.0,
|
||||
),
|
||||
)
|
||||
|
||||
greedy_transcription = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
temperature=0.0,
|
||||
extra_body=dict(seed=42),
|
||||
)
|
||||
|
||||
assert greedy_transcription.text != transcription.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_audio_prompt(mary_had_lamb, whisper_client):
|
||||
prompt = "This is a speech, recorded in a phonograph."
|
||||
# Prompts should not omit the part of original prompt while transcribing.
|
||||
prefix = "The first words I spoke in the original phonograph"
|
||||
transcription = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
out = json.loads(transcription)["text"]
|
||||
assert prefix in out
|
||||
transcription_wprompt = await whisper_client.audio.transcriptions.create(
|
||||
model=MODEL_NAME,
|
||||
file=mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
prompt=prompt,
|
||||
temperature=0.0,
|
||||
)
|
||||
out_prompt = json.loads(transcription_wprompt)["text"]
|
||||
assert prefix in out_prompt
|
||||
@ -1,6 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
|
||||
@ -132,3 +134,129 @@ def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
|
||||
assert result.tool_calls[0].function.name == "searchTool"
|
||||
assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
|
||||
assert result.tool_calls[2].function.name == "searchTool"
|
||||
|
||||
|
||||
def test_extract_tool_calls_deeply_nested_json(parser):
|
||||
# Test with deeply nested JSON parameters (5 levels)
|
||||
model_output = (
|
||||
'{"name": "complexTool", '
|
||||
'"parameters": {'
|
||||
'"level1": {'
|
||||
'"level2": {'
|
||||
'"level3": {'
|
||||
'"level4": {'
|
||||
'"value": "deep"'
|
||||
"}}}}}}"
|
||||
)
|
||||
result = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert result.tools_called is True
|
||||
assert len(result.tool_calls) == 1
|
||||
assert result.tool_calls[0].function.name == "complexTool"
|
||||
# Verify the nested structure is preserved in the arguments
|
||||
import json
|
||||
|
||||
args = json.loads(result.tool_calls[0].function.arguments)
|
||||
assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
|
||||
|
||||
|
||||
def test_extract_tool_calls_multiple_with_deep_nesting(parser):
|
||||
# Test with multiple tool calls where some have deeply nested parameters
|
||||
model_output = (
|
||||
'{"name": "simpleTool", "parameters": {"value": "test"}}; '
|
||||
'{"name": "complexTool", "parameters": '
|
||||
'{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
|
||||
)
|
||||
result = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert result.tools_called is True
|
||||
assert len(result.tool_calls) == 2
|
||||
|
||||
# Check first tool call
|
||||
assert result.tool_calls[0].function.name == "simpleTool"
|
||||
import json
|
||||
|
||||
args0 = json.loads(result.tool_calls[0].function.arguments)
|
||||
assert args0["value"] == "test"
|
||||
|
||||
# Check second tool call with deep nesting
|
||||
assert result.tool_calls[1].function.name == "complexTool"
|
||||
args1 = json.loads(result.tool_calls[1].function.arguments)
|
||||
assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
|
||||
|
||||
|
||||
def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
|
||||
# Test with quotes and brackets inside quoted string values
|
||||
model_output = (
|
||||
'{"name": "searchTool", '
|
||||
'"parameters": {'
|
||||
'"query": "test {value} [complex]",'
|
||||
'"nested": {"inner": "more {brackets}"}'
|
||||
"}}"
|
||||
)
|
||||
result = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert result.tools_called is True
|
||||
assert len(result.tool_calls) == 1
|
||||
assert result.tool_calls[0].function.name == "searchTool"
|
||||
# Verify the string values are preserved including brackets and quotes
|
||||
import json
|
||||
|
||||
args = json.loads(result.tool_calls[0].function.arguments)
|
||||
assert args["query"] == "test {value} [complex]"
|
||||
assert args["nested"]["inner"] == "more {brackets}"
|
||||
|
||||
|
||||
def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
|
||||
# Test with escaped quotes in deeply nested JSON
|
||||
model_output = (
|
||||
'{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
|
||||
)
|
||||
result = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert result.tools_called is True
|
||||
assert len(result.tool_calls) == 1
|
||||
assert result.tool_calls[0].function.name == "parserTool"
|
||||
# Verify escaped quotes are preserved
|
||||
import json
|
||||
|
||||
args = json.loads(result.tool_calls[0].function.arguments)
|
||||
assert args["text"] == 'He said "Hello {world}"'
|
||||
|
||||
|
||||
def test_extract_tool_calls_missing_name_key(parser):
|
||||
# Test that missing "name" key returns content
|
||||
model_output = '{"parameters": {}}'
|
||||
result = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert result.tools_called is False
|
||||
assert len(result.tool_calls) == 0
|
||||
assert result.content == model_output
|
||||
|
||||
|
||||
def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
|
||||
# Test that missing both "parameters" and "arguments" keys returns content
|
||||
model_output = '{"name": "toolWithoutParams"}'
|
||||
result = parser.extract_tool_calls(model_output, None)
|
||||
|
||||
assert result.tools_called is False
|
||||
assert len(result.tool_calls) == 0
|
||||
assert result.content == model_output
|
||||
|
||||
|
||||
def test_regex_timeout_handling(parser):
|
||||
"""Test regex timeout is handled gracefully"""
|
||||
fake_problematic_input = "{hello world[A(A=" + "\t)A(A=,\t" * 2
|
||||
|
||||
# create a mock regex that raises TimeoutError
|
||||
mock_regex = MagicMock()
|
||||
mock_regex.finditer.side_effect = TimeoutError("Regex timeout")
|
||||
|
||||
with patch.object(parser, "tool_call_start_regex", mock_regex):
|
||||
result = parser.extract_tool_calls(fake_problematic_input, None)
|
||||
|
||||
# should treat as regular text when regex times out
|
||||
assert result.content == fake_problematic_input
|
||||
assert result.tools_called is False
|
||||
assert len(result.tool_calls) == 0
|
||||
mock_regex.finditer.assert_called_once()
|
||||
|
||||
0
tests/entrypoints/pooling/pooling/__init__.py
Normal file
0
tests/entrypoints/pooling/pooling/__init__.py
Normal file
0
tests/entrypoints/pooling/reward/__init__.py
Normal file
0
tests/entrypoints/pooling/reward/__init__.py
Normal file
0
tests/entrypoints/pooling/score/__init__.py
Normal file
0
tests/entrypoints/pooling/score/__init__.py
Normal file
@ -2,6 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content,
|
||||
ResponseReasoningItem,
|
||||
@ -76,6 +79,18 @@ class TestResponsesUtils:
|
||||
== 'Hmm, the user has just started with a simple "Hello,"'
|
||||
)
|
||||
|
||||
tool_call_output = ResponseFunctionToolCallOutputItem(
|
||||
id="temp_id",
|
||||
type="function_call_output",
|
||||
call_id="temp",
|
||||
output="1234",
|
||||
status="completed",
|
||||
)
|
||||
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
|
||||
assert formatted_item["role"] == "tool"
|
||||
assert formatted_item["content"] == "1234"
|
||||
assert formatted_item["tool_call_id"] == "temp"
|
||||
|
||||
item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[],
|
||||
|
||||
@ -13,12 +13,6 @@ from vllm.attention.layer import Attention, MultiHeadAttention
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.mem_utils import get_max_shared_memory_bytes
|
||||
|
||||
if not current_platform.is_rocm():
|
||||
from xformers import ops as xops
|
||||
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
|
||||
|
||||
from tests.kernels.utils import make_alibi_bias
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
# This will change depending on the compute capability.
|
||||
# - 512 as a buffer
|
||||
@ -448,129 +442,6 @@ def ref_multi_query_kv_attention(
|
||||
return torch.cat(ref_outputs, dim=0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
|
||||
)
|
||||
@torch.inference_mode()
|
||||
def test_multi_query_kv_attention(
|
||||
num_seqs: int,
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
use_alibi: bool = False,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
|
||||
# As the xformers library is already tested with its own tests, we can use
|
||||
# a smaller MAX_SEQ_LEN here.
|
||||
max_len = min(MAX_SEQ_LEN, 4096)
|
||||
seq_lens = random.sample(range(1, max_len), num_seqs)
|
||||
num_tokens = sum(seq_lens)
|
||||
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
num_query_heads, num_kv_heads = num_heads
|
||||
qkv = torch.empty(
|
||||
num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype
|
||||
)
|
||||
qkv.uniform_(-scale, scale)
|
||||
query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1)
|
||||
|
||||
num_queries_per_kv = num_query_heads // num_kv_heads
|
||||
if num_queries_per_kv > 1:
|
||||
# Handle MQA and GQA
|
||||
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
|
||||
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
|
||||
alibi_bias = None
|
||||
if use_alibi:
|
||||
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
|
||||
attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
|
||||
output = torch.empty_like(query)
|
||||
start = 0
|
||||
# Dynamic sequence length not supported with custom attn_bias.
|
||||
for i, seq_len in enumerate(seq_lens):
|
||||
end = start + seq_len
|
||||
out = xops.memory_efficient_attention_forward(
|
||||
query[None, start:end],
|
||||
key[None, start:end],
|
||||
value[None, start:end],
|
||||
attn_bias=attn_bias[i],
|
||||
p=0.0,
|
||||
scale=scale,
|
||||
)
|
||||
output[start:end].copy_(out.view_as(query[start:end]))
|
||||
start += seq_len
|
||||
# xformers.AttentionBias to Tensor for use in reference impl.
|
||||
alibi_bias = [
|
||||
b.materialize((1, num_query_heads, i, i), device=device).squeeze()
|
||||
for b, i in zip(attn_bias, seq_lens)
|
||||
]
|
||||
else:
|
||||
attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
|
||||
output = xops.memory_efficient_attention_forward(
|
||||
query.unsqueeze(0),
|
||||
key.unsqueeze(0),
|
||||
value.unsqueeze(0),
|
||||
attn_bias=attn_bias,
|
||||
p=0.0,
|
||||
scale=scale,
|
||||
)
|
||||
output = output.squeeze(0)
|
||||
|
||||
cu_seq_lens = [0]
|
||||
for seq_len in seq_lens:
|
||||
cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
|
||||
ref_output = ref_multi_query_kv_attention(
|
||||
cu_seq_lens,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
scale,
|
||||
alibi_bias,
|
||||
dtype,
|
||||
)
|
||||
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
|
||||
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
|
||||
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", [64])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("seed", SEEDS)
|
||||
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
|
||||
)
|
||||
@torch.inference_mode()
|
||||
def test_multi_query_kv_attention_with_alibi(
|
||||
num_seqs: int,
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
return test_multi_query_kv_attention(
|
||||
num_seqs,
|
||||
num_heads,
|
||||
head_size,
|
||||
dtype,
|
||||
seed,
|
||||
device,
|
||||
use_alibi=True,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
|
||||
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
|
||||
head_size = 64
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user