mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-02 13:57:12 +08:00
Merge branch 'main' into rename_file_info_to_pkg/file
This commit is contained in:
commit
d194acec25
@ -25,20 +25,22 @@ function cpu_tests() {
|
||||
|
||||
# offline inference
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -xve
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
|
||||
|
||||
# Run basic model test
|
||||
podman exec -it "$container_id" bash -c "
|
||||
export TORCH_COMPILE_DISABLE=1
|
||||
set -evx
|
||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||
pip install sentence-transformers datamodel_code_generator
|
||||
pip install sentence-transformers datamodel_code_generator tblib
|
||||
|
||||
# Note: disable Bart until supports V1
|
||||
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
|
||||
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
|
||||
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
|
||||
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
|
||||
|
||||
@ -17,7 +17,17 @@ wait_for_server() {
|
||||
}
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
|
||||
@ -17,7 +17,16 @@ wait_for_server() {
|
||||
}
|
||||
|
||||
MODEL="QWen/Qwen3-30B-A3B-FP8"
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
|
||||
@ -754,6 +754,7 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- vllm/transformers_utils/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
@ -1319,7 +1320,10 @@ steps:
|
||||
- pytest -v -s -x lora/test_llama_tp.py
|
||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
# Disabled for now because MXFP4 backend on non-cuda platform
|
||||
# doesn't support LoRA yet
|
||||
#- pytest -v -s -x lora/test_gptoss_tp.py
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
|
||||
@ -346,6 +346,18 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s v1/attention
|
||||
|
||||
- label: Batch Invariance Tests (H100) # 10min
|
||||
timeout_in_minutes: 25
|
||||
gpu: h100
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1/determinism/
|
||||
commands:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pip install pytest-timeout pytest-forked
|
||||
- pytest -v -s v1/determinism/test_batch_invariance.py
|
||||
- pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
|
||||
|
||||
- label: V1 Test attention (B200) # 10min
|
||||
timeout_in_minutes: 30
|
||||
gpu: b200
|
||||
@ -679,6 +691,7 @@ steps:
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/models/
|
||||
- vllm/transformers_utils/
|
||||
- tests/models/test_initialization.py
|
||||
commands:
|
||||
# Only when vLLM model source is modified - test initialization of a large
|
||||
|
||||
5
.github/CODEOWNERS
vendored
5
.github/CODEOWNERS
vendored
@ -9,6 +9,7 @@
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
|
||||
/vllm/model_executor/layers/mamba @tdoublep
|
||||
/vllm/model_executor/model_loader @22quinn
|
||||
/vllm/model_executor/layers/batch_invariant.py @yewentao256
|
||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
@ -35,6 +36,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/vllm/v1/kv_cache_interface.py @heheda12345
|
||||
/vllm/v1/offloading @ApostaC
|
||||
|
||||
# Model runner V2
|
||||
/vllm/v1/worker/gpu @WoosukKwon
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
@ -56,6 +60,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
/tests/v1/kv_connector/nixl_integration @NickLucche
|
||||
/tests/v1/kv_connector @ApostaC
|
||||
/tests/v1/offloading @ApostaC
|
||||
/tests/v1/determinism @yewentao256
|
||||
|
||||
# Transformers modeling backend
|
||||
/vllm/model_executor/models/transformers @hmellor
|
||||
|
||||
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 71bb26f6295449be880344b93b51791cc009237d
|
||||
GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
|
||||
@ -13,6 +13,18 @@
|
||||
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
#include "cpu_attn_neon.hpp"
|
||||
#define NEON_DISPATCH(...) \
|
||||
case cpu_attention::ISA::NEON: { \
|
||||
using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
|
||||
scalar_t, head_dim>; \
|
||||
return __VA_ARGS__(); \
|
||||
}
|
||||
#else
|
||||
#define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
|
||||
#endif // #ifdef __aarch64__
|
||||
|
||||
#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
|
||||
case HEAD_DIM: { \
|
||||
constexpr size_t head_dim = HEAD_DIM; \
|
||||
@ -41,6 +53,7 @@
|
||||
[&] { \
|
||||
switch (ISA_TYPE) { \
|
||||
AMX_DISPATCH(__VA_ARGS__) \
|
||||
NEON_DISPATCH(__VA_ARGS__) \
|
||||
case cpu_attention::ISA::VEC: { \
|
||||
using attn_impl = \
|
||||
cpu_attention::AttentionImpl<cpu_attention::ISA::VEC, scalar_t, \
|
||||
@ -73,6 +86,8 @@ torch::Tensor get_scheduler_metadata(
|
||||
isa = cpu_attention::ISA::VEC;
|
||||
} else if (isa_hint == "vec16") {
|
||||
isa = cpu_attention::ISA::VEC16;
|
||||
} else if (isa_hint == "neon") {
|
||||
isa = cpu_attention::ISA::NEON;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
|
||||
}
|
||||
@ -158,6 +173,8 @@ void cpu_attn_reshape_and_cache(
|
||||
return cpu_attention::ISA::VEC;
|
||||
} else if (isa == "vec16") {
|
||||
return cpu_attention::ISA::VEC16;
|
||||
} else if (isa == "neon") {
|
||||
return cpu_attention::ISA::NEON;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Invalid ISA type: " + isa);
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
#include "utils.hpp"
|
||||
|
||||
namespace cpu_attention {
|
||||
enum class ISA { AMX, VEC, VEC16 };
|
||||
enum class ISA { AMX, VEC, VEC16, NEON };
|
||||
|
||||
template <ISA isa, typename scalar_t, int64_t head_dim>
|
||||
class AttentionImpl {};
|
||||
@ -143,6 +143,12 @@ struct AttentionMetadata {
|
||||
case ISA::VEC:
|
||||
ss << "VEC, ";
|
||||
break;
|
||||
case ISA::VEC16:
|
||||
ss << "VEC16, ";
|
||||
break;
|
||||
case ISA::NEON:
|
||||
ss << "NEON, ";
|
||||
break;
|
||||
}
|
||||
ss << "workitem_group_num: " << workitem_group_num
|
||||
<< ", reduction_item_num: " << reduction_item_num
|
||||
|
||||
386
csrc/cpu/cpu_attn_neon.hpp
Normal file
386
csrc/cpu/cpu_attn_neon.hpp
Normal file
@ -0,0 +1,386 @@
|
||||
#ifndef CPU_ATTN_NEON_HPP
|
||||
#define CPU_ATTN_NEON_HPP
|
||||
|
||||
#include "cpu_attn_impl.hpp"
|
||||
#include <arm_neon.h>
|
||||
#include <type_traits>
|
||||
namespace cpu_attention {
|
||||
|
||||
namespace {
|
||||
|
||||
#define BLOCK_SIZE_ALIGNMENT 32
|
||||
#define HEAD_SIZE_ALIGNMENT 32
|
||||
#define MAX_Q_HEAD_NUM_PER_ITER 16
|
||||
|
||||
// These do not use vectorized class for loading / converting
|
||||
// because csrc/cpu/cpu_types_arm.hpp does not have fallback options
|
||||
// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that
|
||||
// doesn't support BF16.
|
||||
// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency.
|
||||
template <typename kv_cache_t>
|
||||
FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
|
||||
float32x4_t& b1);
|
||||
|
||||
template <>
|
||||
FORCE_INLINE void load_row8_B_as_f32<float>(const float* p, float32x4_t& b0,
|
||||
float32x4_t& b1) {
|
||||
b0 = vld1q_f32(p + 0);
|
||||
b1 = vld1q_f32(p + 4);
|
||||
}
|
||||
|
||||
template <>
|
||||
FORCE_INLINE void load_row8_B_as_f32<c10::Half>(const c10::Half* p,
|
||||
float32x4_t& b0,
|
||||
float32x4_t& b1) {
|
||||
const float16_t* h = reinterpret_cast<const float16_t*>(p);
|
||||
float16x8_t v = vld1q_f16(h);
|
||||
b0 = vcvt_f32_f16(vget_low_f16(v));
|
||||
b1 = vcvt_f32_f16(vget_high_f16(v));
|
||||
}
|
||||
|
||||
template <>
|
||||
FORCE_INLINE void load_row8_B_as_f32<c10::BFloat16>(const c10::BFloat16* p,
|
||||
float32x4_t& b0,
|
||||
float32x4_t& b1) {
|
||||
const uint16_t* u = reinterpret_cast<const uint16_t*>(p);
|
||||
#ifdef ARM_BF16_SUPPORT
|
||||
uint16x8_t u0 = vld1q_u16(u);
|
||||
bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0);
|
||||
b0 = vcvtq_low_f32_bf16(bf0);
|
||||
b1 = vcvtq_high_f32_bf16(bf0);
|
||||
#else
|
||||
uint16x8_t x0 = vld1q_u16(u);
|
||||
uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16);
|
||||
uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16);
|
||||
b0 = vreinterpretq_f32_u32(lo);
|
||||
b1 = vreinterpretq_f32_u32(hi);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
|
||||
// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
|
||||
// #FMLAs = (K // 4) * (4 * 2 * M)
|
||||
// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
|
||||
template <int32_t M, typename kv_cache_t>
|
||||
FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
|
||||
const float* __restrict A, // [M x K],
|
||||
const kv_cache_t* __restrict B, // [K x 8],
|
||||
float* __restrict C, // [M x 8],
|
||||
int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
|
||||
// kernel supports max M of 8, as it'd spill for larger M
|
||||
static_assert(1 <= M && M <= 8, "M must be in [1,8]");
|
||||
|
||||
// helpers for per-M codegen
|
||||
#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
|
||||
#define IF_M(i) if constexpr (M > (i))
|
||||
|
||||
// A row base pointers
|
||||
#define DECL_A(i) const float* a##i = A + (i) * lda;
|
||||
ROWS_APPLY(DECL_A)
|
||||
#undef DECL_A
|
||||
|
||||
// declare 2 accumulators per row of M
|
||||
#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
|
||||
ROWS_APPLY(DECL_ACC)
|
||||
#undef DECL_ACC
|
||||
|
||||
// initialize accumulators
|
||||
#define INIT_ACC(i) \
|
||||
IF_M(i) { \
|
||||
if (accumulate) { \
|
||||
acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
|
||||
acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
|
||||
} else { \
|
||||
acc##i##_0 = vdupq_n_f32(0.f); \
|
||||
acc##i##_1 = vdupq_n_f32(0.f); \
|
||||
} \
|
||||
}
|
||||
ROWS_APPLY(INIT_ACC)
|
||||
#undef INIT_ACC
|
||||
|
||||
int32_t k = 0;
|
||||
|
||||
// K unrolled by 4
|
||||
for (; k + 3 < K; k += 4) {
|
||||
// load A[k..k+3] for each active row (M)
|
||||
#define LOAD_A4(i) \
|
||||
float32x4_t a##i##v; \
|
||||
IF_M(i) a##i##v = vld1q_f32(a##i + k);
|
||||
ROWS_APPLY(LOAD_A4)
|
||||
#undef LOAD_A4
|
||||
|
||||
// helper: FMA lane L from aiv
|
||||
#define FMAS_LANE(i, aiv, L) \
|
||||
IF_M(i) { \
|
||||
acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
|
||||
acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
|
||||
}
|
||||
|
||||
// k + 0
|
||||
{
|
||||
float32x4_t b0, b1;
|
||||
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 0) * ldb, b0, b1);
|
||||
#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
|
||||
ROWS_APPLY(STEP_K0)
|
||||
#undef STEP_K0
|
||||
}
|
||||
// k + 1
|
||||
{
|
||||
float32x4_t b0, b1;
|
||||
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 1) * ldb, b0, b1);
|
||||
#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
|
||||
ROWS_APPLY(STEP_K1)
|
||||
#undef STEP_K1
|
||||
}
|
||||
// k + 2
|
||||
{
|
||||
float32x4_t b0, b1;
|
||||
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 2) * ldb, b0, b1);
|
||||
#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
|
||||
ROWS_APPLY(STEP_K2)
|
||||
#undef STEP_K2
|
||||
}
|
||||
// k + 3
|
||||
{
|
||||
float32x4_t b0, b1;
|
||||
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)(k + 3) * ldb, b0, b1);
|
||||
#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
|
||||
ROWS_APPLY(STEP_K3)
|
||||
#undef STEP_K3
|
||||
}
|
||||
#undef FMAS_LANE
|
||||
}
|
||||
|
||||
// K tail
|
||||
for (; k < K; ++k) {
|
||||
float32x4_t b0, b1;
|
||||
load_row8_B_as_f32<kv_cache_t>(B + (int64_t)k * ldb, b0, b1);
|
||||
#define TAIL_ROW(i) \
|
||||
IF_M(i) { \
|
||||
float32x4_t ai = vdupq_n_f32(*(a##i + k)); \
|
||||
acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
|
||||
acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
|
||||
}
|
||||
ROWS_APPLY(TAIL_ROW)
|
||||
#undef TAIL_ROW
|
||||
}
|
||||
|
||||
// store accumulators to C
|
||||
#define STORE_ROW(i) \
|
||||
IF_M(i) { \
|
||||
vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
|
||||
vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
|
||||
}
|
||||
ROWS_APPLY(STORE_ROW)
|
||||
#undef STORE_ROW
|
||||
|
||||
#undef ROWS_APPLY
|
||||
#undef IF_M
|
||||
}
|
||||
|
||||
template <int32_t N, typename kv_cache_t>
|
||||
FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
|
||||
const kv_cache_t* __restrict B,
|
||||
float* __restrict C, int32_t M,
|
||||
int32_t K, int64_t lda,
|
||||
int64_t ldb, int64_t ldc,
|
||||
bool accumulate) {
|
||||
// micro kernel is Mx8
|
||||
static_assert(N % 8 == 0, "N must be a multiple of 8");
|
||||
for (int32_t m = 0; m < M;) {
|
||||
int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
|
||||
const float* Ab = A + m * lda;
|
||||
float* Cb = C + m * ldc;
|
||||
|
||||
for (int32_t n = 0; n < N; n += 8) {
|
||||
const kv_cache_t* Bn = B + n;
|
||||
float* Cn = Cb + n;
|
||||
switch (mb) {
|
||||
case 8:
|
||||
gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
|
||||
K, accumulate);
|
||||
break;
|
||||
case 4:
|
||||
gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
|
||||
K, accumulate);
|
||||
break;
|
||||
case 2:
|
||||
gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
|
||||
K, accumulate);
|
||||
break;
|
||||
default:
|
||||
gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
|
||||
K, accumulate);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// no tail loop for N as it's guaranteed to be a multiple of 8
|
||||
m += mb;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename kv_cache_t>
|
||||
class TileGemmNeonFMLA {
|
||||
public:
|
||||
template <AttentionGemmPhase phase, int32_t k_size>
|
||||
FORCE_INLINE static void gemm(const int32_t m_size,
|
||||
float* __restrict__ a_tile,
|
||||
kv_cache_t* __restrict__ b_tile,
|
||||
float* __restrict__ c_tile, const int64_t lda,
|
||||
const int64_t ldb, const int64_t ldc,
|
||||
const int32_t block_size,
|
||||
const int32_t dynamic_k_size,
|
||||
const bool accum_c) {
|
||||
if constexpr (phase == AttentionGemmPhase::QK) {
|
||||
gemm_macro_neon_fmla_Mx8_Ku4<BLOCK_SIZE_ALIGNMENT, kv_cache_t>(
|
||||
a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
|
||||
} else {
|
||||
gemm_macro_neon_fmla_Mx8_Ku4<HEAD_SIZE_ALIGNMENT, kv_cache_t>(
|
||||
a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
|
||||
accum_c);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// this is similar to "ISA::VEC" at the moment
|
||||
template <typename scalar_t, int64_t head_dim>
|
||||
class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
|
||||
public:
|
||||
using query_t = scalar_t;
|
||||
using q_buffer_t = float;
|
||||
using kv_cache_t = scalar_t;
|
||||
using logits_buffer_t = float;
|
||||
using partial_output_buffer_t = float;
|
||||
using prob_buffer_t = float;
|
||||
|
||||
constexpr static int64_t BlockSizeAlignment =
|
||||
BLOCK_SIZE_ALIGNMENT; // KV token num unit of QK and PV phases
|
||||
constexpr static int64_t HeadDimAlignment =
|
||||
HEAD_SIZE_ALIGNMENT; // headdim num unit of PV phase
|
||||
constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
|
||||
constexpr static int64_t HeadDim = head_dim;
|
||||
constexpr static ISA ISAType = ISA::NEON;
|
||||
constexpr static bool scale_on_logits = false; // apply scale on q_buffer
|
||||
|
||||
static_assert(HeadDim % HeadDimAlignment == 0);
|
||||
// the gemm micro kernel is Mx8
|
||||
static_assert(HeadDimAlignment % 8 == 0);
|
||||
static_assert(BlockSizeAlignment % 8 == 0);
|
||||
|
||||
public:
|
||||
template <template <typename tile_gemm_t> typename attention>
|
||||
FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
|
||||
attention<TileGemmNeonFMLA<kv_cache_t>> attention_iteration;
|
||||
attention_iteration(CPU_ATTENTION_PARAMS);
|
||||
}
|
||||
|
||||
// k_cache_token_group_stride: stride of K cache when move to next
|
||||
// BlockSizeAlignment tokens in a block
|
||||
constexpr static int64_t k_cache_token_group_stride(
|
||||
const int32_t block_size) {
|
||||
return BlockSizeAlignment; // layout of k_cache block is [head_dim,
|
||||
// block_size], row-major
|
||||
}
|
||||
|
||||
// v_cache_token_group_stride: stride of V cache when move to next
|
||||
// BlockSizeAlignment tokens in a block
|
||||
constexpr static int64_t v_cache_token_group_stride(
|
||||
const int32_t block_size) {
|
||||
return head_dim * BlockSizeAlignment; // layout of v_cache is [block_size,
|
||||
// head_dim], row-major
|
||||
}
|
||||
|
||||
// v_cache_head_group_stride: stride of V cache when move to next
|
||||
// HeadDimAlignment head dims in a block
|
||||
constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
|
||||
return HeadDimAlignment; // layout of v_cache is [block_size, head_dim],
|
||||
// row-major
|
||||
}
|
||||
|
||||
// Copy q to q_buffer and cast it to fp32
|
||||
static void copy_q_heads_tile(
|
||||
scalar_t* __restrict__ src, // [q_num, q_heads_per_kv, head_size]
|
||||
float* __restrict__ q_buffer, const int32_t q_num,
|
||||
const int32_t q_heads_per_kv, const int64_t q_num_stride,
|
||||
const int64_t q_head_stride, float scale) {
|
||||
static_assert(head_dim % 16 == 0);
|
||||
constexpr int32_t unroll_size = head_dim / 16;
|
||||
using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
|
||||
|
||||
vec_op::FP32Vec16 scale_vec(scale);
|
||||
for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
|
||||
for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
|
||||
scalar_t* __restrict__ curr_q =
|
||||
src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
|
||||
float* __restrict__ curr_q_buffer =
|
||||
q_buffer + q_num_idx * q_heads_per_kv * head_dim +
|
||||
q_head_idx * head_dim;
|
||||
|
||||
vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
|
||||
load_vec_t vec(curr_q);
|
||||
vec_op::FP32Vec16 fp32_vec(vec);
|
||||
fp32_vec = fp32_vec * scale_vec;
|
||||
fp32_vec.save(curr_q_buffer);
|
||||
|
||||
curr_q += 16;
|
||||
curr_q_buffer += 16;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reshape K as column-major and V as row-major
|
||||
static void reshape_and_cache(
|
||||
const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
|
||||
scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
|
||||
const int64_t* __restrict__ slot_mapping, const int64_t token_num,
|
||||
const int64_t key_token_num_stride, const int64_t value_token_num_stride,
|
||||
const int64_t head_num, const int64_t key_head_num_stride,
|
||||
const int64_t value_head_num_stride, const int64_t num_blocks,
|
||||
const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
|
||||
const int64_t block_size, const int64_t block_size_stride) {
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
|
||||
for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
|
||||
const int64_t pos = slot_mapping[token_idx];
|
||||
if (pos < 0) {
|
||||
// skip
|
||||
continue;
|
||||
}
|
||||
|
||||
const int64_t block_idx = pos / block_size;
|
||||
const int64_t block_offset = pos % block_size;
|
||||
{
|
||||
// Write Key
|
||||
const scalar_t* key_start_ptr = key +
|
||||
token_idx * key_token_num_stride +
|
||||
head_idx * key_head_num_stride;
|
||||
scalar_t* key_cache_start_ptr =
|
||||
key_cache + block_idx * num_blocks_stride +
|
||||
head_idx * cache_head_num_stride + block_offset;
|
||||
|
||||
#pragma GCC unroll 8
|
||||
for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
|
||||
key_cache_start_ptr[j] = key_start_ptr[i];
|
||||
}
|
||||
}
|
||||
{
|
||||
// Write Value
|
||||
const scalar_t* value_start_ptr = value +
|
||||
token_idx * value_token_num_stride +
|
||||
head_idx * value_head_num_stride;
|
||||
scalar_t* value_cache_start_ptr =
|
||||
value_cache + block_idx * num_blocks_stride +
|
||||
head_idx * cache_head_num_stride + block_offset * head_dim;
|
||||
std::memcpy(value_cache_start_ptr, value_start_ptr,
|
||||
sizeof(scalar_t) * head_dim);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace cpu_attention
|
||||
|
||||
#endif // #ifndef CPU_ATTN_NEON_HPP
|
||||
@ -45,31 +45,54 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
// Memory node binding
|
||||
if (numa_available() != -1) {
|
||||
int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
|
||||
// Verify all CPUs are on the same NUMA node
|
||||
for (size_t i = 1; i < omp_cpu_ids.size(); ++i) {
|
||||
int node_id = numa_node_of_cpu(omp_cpu_ids[i]);
|
||||
TORCH_CHECK(node_id == mem_node_id, "CPU ", omp_cpu_ids[i],
|
||||
" is on NUMA node ", node_id, ", but CPU ",
|
||||
omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
|
||||
". All CPUs should be on the same NUMA node for optimal "
|
||||
"performance. Memory will be bound to NUMA node ",
|
||||
mem_node_id, ".");
|
||||
std::set<int> node_ids;
|
||||
for (const auto& cpu_id : omp_cpu_ids) {
|
||||
int node_id = numa_node_of_cpu(cpu_id);
|
||||
if (node_id != -1) {
|
||||
node_ids.insert(node_id);
|
||||
}
|
||||
TORCH_WARN(node_id == mem_node_id, "CPU ", cpu_id, " is on NUMA node ",
|
||||
node_id, ", but CPU ", omp_cpu_ids.front(),
|
||||
" is on NUMA node ", mem_node_id,
|
||||
". All CPUs should be on the same NUMA node for optimal "
|
||||
"performance. Memory will be bound to NUMA node ",
|
||||
mem_node_id, ".");
|
||||
}
|
||||
bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str());
|
||||
bitmask* src_mask = numa_get_membind();
|
||||
// Concatenate all node_ids into a single comma-separated string
|
||||
if (!node_ids.empty()) {
|
||||
std::string node_ids_str;
|
||||
for (const int node_id : node_ids) {
|
||||
if (!node_ids_str.empty()) {
|
||||
node_ids_str += ",";
|
||||
}
|
||||
node_ids_str += std::to_string(node_id);
|
||||
}
|
||||
|
||||
int pid = getpid();
|
||||
bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
|
||||
bitmask* src_mask = numa_get_membind();
|
||||
|
||||
// move all existing pages to the specified numa node.
|
||||
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
|
||||
int page_num = numa_migrate_pages(pid, src_mask, mask);
|
||||
if (page_num == -1) {
|
||||
TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
|
||||
int pid = getpid();
|
||||
|
||||
if (mask && src_mask) {
|
||||
// move all existing pages to the specified numa node.
|
||||
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
|
||||
int page_num = numa_migrate_pages(pid, src_mask, mask);
|
||||
if (page_num == -1) {
|
||||
TORCH_WARN("numa_migrate_pages failed. errno: " +
|
||||
std::to_string(errno));
|
||||
}
|
||||
|
||||
// restrict memory allocation node.
|
||||
numa_set_membind(mask);
|
||||
numa_set_strict(1);
|
||||
|
||||
numa_free_nodemask(mask);
|
||||
numa_free_nodemask(src_mask);
|
||||
} else {
|
||||
TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
|
||||
std::to_string(errno));
|
||||
}
|
||||
}
|
||||
|
||||
// restrict memory allocation node.
|
||||
numa_set_membind(mask);
|
||||
numa_set_strict(1);
|
||||
}
|
||||
|
||||
// OMP threads binding
|
||||
|
||||
@ -22,15 +22,10 @@ torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
|
||||
auto strides = cpu_tensor.strides();
|
||||
auto options = cpu_tensor.options().device(torch::kCUDA);
|
||||
|
||||
// from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
|
||||
// const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
|
||||
// memory, so we don't free it here.
|
||||
auto deleter = [](void*) {
|
||||
// no-op, since the memory is owned by the original CPU tensor
|
||||
};
|
||||
|
||||
// use default no-op deleter, since the memory is owned by the original CPU
|
||||
// tensor
|
||||
torch::Tensor cuda_tensor =
|
||||
torch::from_blob(device_ptr, sizes, strides, deleter, options);
|
||||
torch::from_blob(device_ptr, sizes, strides, options);
|
||||
|
||||
TORCH_CHECK(cuda_tensor.device().is_cuda(),
|
||||
"Resulting tensor is not on CUDA device");
|
||||
|
||||
@ -85,7 +85,7 @@ ARG GET_PIP_URL
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \
|
||||
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
||||
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
||||
@ -224,6 +224,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
|
||||
|
||||
# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
|
||||
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
|
||||
|
||||
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
|
||||
# Install EP kernels(pplx-kernels and DeepEP)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
|
||||
/tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
|
||||
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
|
||||
|
||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
|
||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
|
||||
# sync the default value with .buildkite/check-wheel-size.py
|
||||
@ -289,7 +305,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
|
||||
&& apt-get install -y software-properties-common curl sudo python3-pip \
|
||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
||||
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
|
||||
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
|
||||
@ -356,36 +372,32 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
uv pip list
|
||||
|
||||
# Even when we build Flashinfer with AOT mode, there's still
|
||||
# some issues w.r.t. JIT compilation. Therefore we need to
|
||||
# install build dependencies for JIT compilation.
|
||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||
COPY requirements/build.txt requirements/build.txt
|
||||
# Install deepgemm wheel that has been built in the `build` stage
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt \
|
||||
--mount=type=bind,from=build,source=/tmp/deepgemm/dist,target=/tmp/deepgemm/dist,ro \
|
||||
sh -c 'if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
|
||||
uv pip install --system /tmp/deepgemm/dist/*.whl; \
|
||||
else \
|
||||
echo "No DeepGEMM wheels to install; skipping."; \
|
||||
fi'
|
||||
|
||||
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
|
||||
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
|
||||
|
||||
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
|
||||
RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system ep_kernels/dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# Install DeepGEMM from source
|
||||
ARG DEEPGEMM_GIT_REF
|
||||
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
|
||||
|
||||
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
|
||||
RUN set -eux; \
|
||||
RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
|
||||
set -eux; \
|
||||
case "${TARGETPLATFORM}" in \
|
||||
linux/arm64) UUARCH="aarch64" ;; \
|
||||
linux/amd64) UUARCH="x64" ;; \
|
||||
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
|
||||
esac; \
|
||||
./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
|
||||
rm ./install_gdrcopy.sh
|
||||
|
||||
# Install EP kernels(pplx-kernels and DeepEP)
|
||||
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
|
||||
ENV CUDA_HOME=/usr/local/cuda
|
||||
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
|
||||
&& bash install_python_libraries.sh
|
||||
/tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
|
||||
|
||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
||||
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
|
||||
@ -415,6 +427,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y git
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
@ -455,12 +472,11 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
|
||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
|
||||
|
||||
# install additional dependencies for openai api server
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
|
||||
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
|
||||
uv pip install --system -r requirements/kv_connectors.txt; \
|
||||
uv pip install --system -r /tmp/kv_connectors.txt; \
|
||||
fi; \
|
||||
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
|
||||
BITSANDBYTES_VERSION="0.42.0"; \
|
||||
|
||||
@ -8,8 +8,8 @@ FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openbl
|
||||
|
||||
ARG MAX_JOBS
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
|
||||
&& source /opt/rh/gcc-toolset-13/enable \
|
||||
RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \
|
||||
&& source /opt/rh/gcc-toolset-14/enable \
|
||||
&& wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
|
||||
&& unzip OpenBLAS-$OPENBLAS_VERSION.zip \
|
||||
&& cd OpenBLAS-$OPENBLAS_VERSION \
|
||||
@ -57,7 +57,7 @@ COPY --from=openblas-builder /tmp/control /dev/null
|
||||
RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
|
||||
dnf install -y openssl-devel \
|
||||
&& dnf install -y \
|
||||
git tar gcc-toolset-13 automake libtool \
|
||||
git tar gcc-toolset-14 automake libtool \
|
||||
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
|
||||
libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
|
||||
harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
|
||||
@ -84,7 +84,7 @@ ARG _GLIBCXX_USE_CXX11_ABI=1
|
||||
ARG OPENBLAS_VERSION=0.3.30
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
|
||||
cd pytorch && \
|
||||
uv pip install -r requirements.txt && \
|
||||
@ -97,7 +97,7 @@ ARG TORCHVISION_VERSION=0.22.0
|
||||
ARG TORCHVISION_USE_NVJPEG=0
|
||||
ARG TORCHVISION_USE_FFMPEG=0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \
|
||||
cd vision && \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
@ -113,7 +113,7 @@ ARG USE_ROCM=0
|
||||
ARG USE_CUDA=0
|
||||
ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \
|
||||
cd audio && \
|
||||
MAX_JOBS=${MAX_JOBS:-$(nproc)} \
|
||||
@ -130,7 +130,7 @@ ARG MAX_JOBS
|
||||
ARG PYARROW_PARALLEL
|
||||
ARG PYARROW_VERSION=21.0.0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
|
||||
cd arrow/cpp && \
|
||||
mkdir build && cd build && \
|
||||
@ -162,7 +162,7 @@ ARG OPENCV_VERSION=86
|
||||
ARG OPENCV_PATCH=97f3f39
|
||||
ARG ENABLE_HEADLESS=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
|
||||
cd opencv-python && \
|
||||
sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
|
||||
@ -196,7 +196,7 @@ ARG MAX_JOBS
|
||||
ARG NUMBA_VERSION=0.61.2
|
||||
|
||||
# Clone all required dependencies
|
||||
RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-14/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
|
||||
cd ./numba && \
|
||||
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
|
||||
@ -211,6 +211,9 @@ RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset
|
||||
|
||||
FROM base-builder AS vllmcache-builder
|
||||
|
||||
ENV LLVM_CONFIG=/usr/lib64/llvm15/bin/llvm-config
|
||||
ENV PATH=/usr/lib64/llvm15/bin:$PATH
|
||||
|
||||
COPY --from=torch-builder /tmp/control /dev/null
|
||||
COPY --from=arrow-builder /tmp/control /dev/null
|
||||
COPY --from=cv-builder /tmp/control /dev/null
|
||||
@ -225,10 +228,13 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
dnf install llvm15 llvm15-devel -y && \
|
||||
rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
|
||||
uv pip install maturin && \
|
||||
uv build --wheel --out-dir /hf_wheels/
|
||||
|
||||
ENV CXXFLAGS="-fno-lto -Wno-error=free-nonheap-object" \
|
||||
CFLAGS="-fno-lto"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
|
||||
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
|
||||
@ -236,7 +242,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
|
||||
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
|
||||
--mount=type=bind,src=.,dst=/src/,rw \
|
||||
source /opt/rh/gcc-toolset-13/enable && \
|
||||
source /opt/rh/gcc-toolset-14/enable && \
|
||||
export PATH=$PATH:/usr/lib64/llvm15/bin && \
|
||||
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
|
||||
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
|
||||
@ -260,7 +266,7 @@ FROM base-builder AS lapack-builder
|
||||
ARG MAX_JOBS
|
||||
ARG LAPACK_VERSION=3.12.1
|
||||
RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \
|
||||
&& cd lapack && source /opt/rh/gcc-toolset-13/enable \
|
||||
&& cd lapack && source /opt/rh/gcc-toolset-14/enable \
|
||||
&& cmake -B build -S . \
|
||||
&& cmake --build build -j ${MAX_JOBS:-$(nproc)}
|
||||
|
||||
@ -299,7 +305,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
|
||||
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
|
||||
microdnf install --nodocs -y \
|
||||
libomp tar findutils openssl llvm15 llvm15-devel \
|
||||
libomp libicu tar findutils openssl llvm15 llvm15-devel \
|
||||
pkgconfig xsimd g++ gcc-fortran libsndfile \
|
||||
libtiff libjpeg openjpeg2 zlib zeromq \
|
||||
freetype lcms2 libwebp tcl tk utf8proc \
|
||||
|
||||
@ -7,6 +7,8 @@ FROM ${BASE_IMAGE} AS base
|
||||
|
||||
ARG ARG_PYTORCH_ROCM_ARCH
|
||||
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update -q -y && apt-get install -q -y \
|
||||
@ -121,8 +123,6 @@ COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
|
||||
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
|
||||
COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker
|
||||
|
||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
|
||||
ENV TOKENIZERS_PARALLELISM=false
|
||||
|
||||
# ENV that can improve safe tensor loading, and end-to-end time
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 119 KiB After Width: | Height: | Size: 131 KiB |
@ -49,9 +49,6 @@ llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
|
||||
|
||||
By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
|
||||
|
||||
!!! warning
|
||||
CUDA graph capture takes up more memory in V1 than in V0.
|
||||
|
||||
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
|
||||
|
||||
??? code
|
||||
|
||||
@ -31,9 +31,7 @@ In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as re
|
||||
|
||||
Chunked prefill allows vLLM to process large prefills in smaller chunks and batch them together with decode requests. This feature helps improve both throughput and latency by better balancing compute-bound (prefill) and memory-bound (decode) operations.
|
||||
|
||||
In vLLM V1, **chunked prefill is always enabled by default**. This is different from vLLM V0, where it was conditionally enabled based on model characteristics.
|
||||
|
||||
With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
|
||||
In V1, **chunked prefill is enabled by default whenever possible**. With chunked prefill enabled, the scheduling policy prioritizes decode requests. It batches all pending decode requests before scheduling any prefill operations. When there are available tokens in the `max_num_batched_tokens` budget, it schedules pending prefills. If a pending prefill request cannot fit into `max_num_batched_tokens`, it automatically chunks it.
|
||||
|
||||
This policy has two benefits:
|
||||
|
||||
|
||||
@ -133,8 +133,6 @@ We consider 3 different scenarios:
|
||||
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
|
||||
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
|
||||
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
|
||||
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
|
||||
V0-only classes and code will be removed in the very near future.
|
||||
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
|
||||
|
||||
For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
|
||||
|
||||
@ -9,7 +9,7 @@ TL;DR:
|
||||
|----------|----------|-------------|
|
||||
| --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
|
||||
| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
|
||||
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(mode=CompilationMode.NONE) | Turn off CUDAGraphs only |
|
||||
| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
|
||||
| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
|
||||
|
||||
## vLLM-torch.compile overview
|
||||
|
||||
@ -4,7 +4,7 @@ The community frequently requests the ability to extend vLLM with custom feature
|
||||
|
||||
## How Plugins Work in vLLM
|
||||
|
||||
Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview](arch_overview.md)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
|
||||
Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview](arch_overview.md)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_plugins_by_group][vllm.plugins.load_plugins_by_group] function in the `vllm.plugins` module.
|
||||
|
||||
## How vLLM Discovers Plugins
|
||||
|
||||
@ -57,6 +57,100 @@ Every plugin has three parts:
|
||||
|
||||
- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes.
|
||||
|
||||
### Platform plugins guidelines
|
||||
|
||||
1. Create a platform plugin project, for example, `vllm_add_dummy_platform`. The project structure should look like this:
|
||||
|
||||
```shell
|
||||
vllm_add_dummy_platform/
|
||||
├── vllm_add_dummy_platform/
|
||||
│ ├── __init__.py
|
||||
│ ├── my_dummy_platform.py
|
||||
│ ├── my_dummy_worker.py
|
||||
│ ├── my_dummy_attention.py
|
||||
│ ├── my_dummy_device_communicator.py
|
||||
│ ├── my_dummy_custom_ops.py
|
||||
├── setup.py
|
||||
```
|
||||
|
||||
2. In the `setup.py` file, add the following entry point:
|
||||
|
||||
```python
|
||||
setup(
|
||||
name="vllm_add_dummy_platform",
|
||||
...
|
||||
entry_points={
|
||||
"vllm.platform_plugins": [
|
||||
"my_dummy_platform = vllm_add_dummy_platform:register"
|
||||
]
|
||||
},
|
||||
...
|
||||
)
|
||||
```
|
||||
|
||||
Please make sure `vllm_add_dummy_platform:register` is a callable function and returns the platform class's fully qualified name. For example:
|
||||
|
||||
```python
|
||||
def register():
|
||||
return "vllm_add_dummy_platform.my_dummy_platform.MyDummyPlatform"
|
||||
```
|
||||
|
||||
3. Implement the platform class `MyDummyPlatform` in `my_dummy_platform.py`. The platform class should inherit from `vllm.platforms.interface.Platform`. Please follow the interface to implement the functions one by one. There are some important functions and properties that should be implemented at least:
|
||||
|
||||
- `_enum`: This property is the device enumeration from [PlatformEnum][vllm.platforms.interface.PlatformEnum]. Usually, it should be `PlatformEnum.OOT`, which means the platform is out-of-tree.
|
||||
- `device_type`: This property should return the type of the device which PyTorch uses. For example, `"cpu"`, `"cuda"`, etc.
|
||||
- `device_name`: This property is usually set to the same value as `device_type`. It's mainly used for logging purposes.
|
||||
- `check_and_update_config`: This function is called very early in the vLLM's initialization process. It's used for plugins to update the vllm configuration. For example, the block size, graph mode config, etc, can be updated in this function. The most important thing is that the **worker_cls** should be set in this function to let vLLM know which worker class to use for the worker process.
|
||||
- `get_attn_backend_cls`: This function should return the attention backend class's fully qualified name.
|
||||
- `get_device_communicator_cls`: This function should return the device communicator class's fully qualified name.
|
||||
|
||||
4. Implement the worker class `MyDummyWorker` in `my_dummy_worker.py`. The worker class should inherit from [WorkerBase][vllm.v1.worker.worker_base.WorkerBase]. Please follow the interface to implement the functions one by one. Basically, all interfaces in the base class should be implemented, since they are called here and there in vLLM. To make sure a model can be executed, the basic functions that should be implemented are:
|
||||
|
||||
- `init_device`: This function is called to set up the device for the worker.
|
||||
- `initialize_cache`: This function is called to set cache config for the worker.
|
||||
- `load_model`: This function is called to load the model weights to device.
|
||||
- `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model.
|
||||
- `determine_available_memory`: This function is called to profile the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs.
|
||||
- `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config
|
||||
- `execute_model`: This function is called every step to inference the model.
|
||||
|
||||
Additional functions that can be implemented are:
|
||||
|
||||
- If the plugin wants to support sleep mode feature, please implement the `sleep` and `wakeup` functions.
|
||||
- If the plugin wants to support graph mode feature, please implement the `compile_or_warm_up_model` function.
|
||||
- If the plugin wants to support speculative decoding feature, please implement the `take_draft_token_ids` function.
|
||||
- If the plugin wants to support lora feature, please implement the `add_lora`,`remove_lora`,`list_loras` and `pin_lora` functions.
|
||||
- If the plugin wants to support data parallelism feature, please implement the `execute_dummy_batch` functions.
|
||||
|
||||
Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented.
|
||||
|
||||
5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
|
||||
|
||||
6. Implement custom ops for high performance. Most ops can be run by the PyTorch native implementation, but the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are several kinds of custom ops vLLM supports:
|
||||
|
||||
- pytorch ops
|
||||
There are 3 kinds of PyTorch ops:
|
||||
|
||||
- `communicator ops`: Device communicator op. Such as all-reduce, all-gather, etc.
|
||||
Please implement the device communicator class `MyDummyDeviceCommunicator` in `my_dummy_device_communicator.py`. The device communicator class should inherit from [DeviceCommunicatorBase][vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase].
|
||||
- `common ops`: Common ops. Such as matmul, softmax, etc.
|
||||
Please implement the common ops by register oot way. See more detail in [CustomOp][vllm.model_executor.custom_op.CustomOp] class.
|
||||
- `csrc ops`: C++ ops. This kind of ops are implemented in C++ and are registered as torch custom ops.
|
||||
Following csrc module and `vllm._custom_ops` to implement your ops.
|
||||
|
||||
- triton ops
|
||||
Custom way doesn't work for triton ops now.
|
||||
|
||||
7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
|
||||
|
||||
## Compatibility Guarantee
|
||||
|
||||
vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development.
|
||||
vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets.
|
||||
|
||||
The interface for the model/module may change during vLLM's development. If you see any deprecation log info, please upgrade your plugin to the latest version.
|
||||
|
||||
## Deprecation announcement
|
||||
|
||||
!!! warning "Deprecations"
|
||||
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0.
|
||||
- `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
|
||||
|
||||
@ -94,9 +94,6 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache
|
||||
|
||||
With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
|
||||
|
||||
!!! note
|
||||
Cache isolation is not supported in engine V0.
|
||||
|
||||
## Data Structure
|
||||
|
||||
The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
|
||||
|
||||
@ -142,7 +142,7 @@ Flags: `--tool-call-parser hermes`
|
||||
Supported models:
|
||||
|
||||
* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed)
|
||||
* Additional mistral function-calling models are compatible as well.
|
||||
* Additional Mistral function-calling models are compatible as well.
|
||||
|
||||
Known issues:
|
||||
|
||||
@ -158,12 +158,25 @@ Known issues:
|
||||
|
||||
Recommended flags:
|
||||
|
||||
1. To use [mistral-common](https://github.com/mistralai/mistral-common) the official Mistral tokenization backend:
|
||||
1. To use the official Mistral AI's format:
|
||||
|
||||
`--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral`
|
||||
`--tool-call-parser mistral`
|
||||
|
||||
2. To use the default Transformers tokenization backend:
|
||||
`--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
|
||||
2. To use the Transformers format when available:
|
||||
|
||||
`--tokenizer_mode hf --config_format hf --load_format hf --tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
|
||||
|
||||
!!! note
|
||||
Models officially released by Mistral AI have two possible formats:
|
||||
|
||||
1. The official format that is used by default with `auto` or `mistral` arguments:
|
||||
|
||||
`--tokenizer_mode mistral --config_format mistral --load_format mistral`
|
||||
This format uses [mistral-common](https://github.com/mistralai/mistral-common), the Mistral AI's tokenizer backend.
|
||||
|
||||
2. The Transformers format, when available, that is used with `hf` arguments:
|
||||
|
||||
`--tokenizer_mode hf --config_format hf --load_format hf --chat-template examples/tool_chat_template_mistral_parallel.jinja`
|
||||
|
||||
### Llama Models (`llama3_json`)
|
||||
|
||||
|
||||
@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
|
||||
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
|
||||
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
|
||||
| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
|
||||
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
|
||||
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
|
||||
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
|
||||
|
||||
@ -118,14 +118,16 @@ The common practice is to set the tensor parallel size to the number of GPUs in
|
||||
```bash
|
||||
vllm serve /path/to/the/model/in/the/container \
|
||||
--tensor-parallel-size 8 \
|
||||
--pipeline-parallel-size 2
|
||||
--pipeline-parallel-size 2 \
|
||||
--distributed-executor-backend ray
|
||||
```
|
||||
|
||||
Alternatively, you can set `tensor_parallel_size` to the total number of GPUs in the cluster:
|
||||
|
||||
```bash
|
||||
vllm serve /path/to/the/model/in/the/container \
|
||||
--tensor-parallel-size 16
|
||||
--tensor-parallel-size 16 \
|
||||
--distributed-executor-backend ray
|
||||
```
|
||||
|
||||
## Optimizing network communication for tensor parallelism
|
||||
|
||||
@ -1,24 +1,23 @@
|
||||
# Reproducibility
|
||||
|
||||
vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve
|
||||
vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. To achieve
|
||||
reproducible results:
|
||||
|
||||
- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
||||
- For V0: Set the global seed (see below).
|
||||
- In offline mode, you can either set `VLLM_ENABLE_V1_MULTIPROCESSING=0` which makes scheduling deterministic,
|
||||
or enable [batch invariance](../features/batch_invariance.md) to make the outputs insensitive to scheduling.
|
||||
- In online mode, you can only enable [batch invariance](../features/batch_invariance.md).
|
||||
|
||||
Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
|
||||
|
||||
!!! warning
|
||||
|
||||
Applying the above settings [changes the random state in user code](#locality-of-random-state).
|
||||
Setting `VLLM_ENABLE_V1_MULTIPROCESSING=0` will change the random state of user code
|
||||
(i.e. the code that constructs [LLM][vllm.LLM] class).
|
||||
|
||||
!!! note
|
||||
|
||||
Even with the above settings, vLLM only provides reproducibility
|
||||
when it runs on the same hardware and the same vLLM version.
|
||||
Also, the online serving API (`vllm serve`) does not support reproducibility
|
||||
because it is almost impossible to make the scheduling deterministic in the
|
||||
online setting.
|
||||
|
||||
## Setting the global seed
|
||||
|
||||
@ -26,27 +25,17 @@ The `seed` parameter in vLLM is used to control the random states for various ra
|
||||
|
||||
If a specific seed value is provided, the random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly.
|
||||
|
||||
However, in some cases, setting the seed will also [change the random state in user code](#locality-of-random-state).
|
||||
|
||||
### Default Behavior
|
||||
|
||||
In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected.
|
||||
|
||||
In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`.
|
||||
|
||||
It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
|
||||
for workflows such as speculative decoding. For more information, see: <https://github.com/vllm-project/vllm/pull/17929>
|
||||
|
||||
!!! note
|
||||
|
||||
It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
|
||||
for workflows such as speculative decoding.
|
||||
|
||||
For more information, see: <https://github.com/vllm-project/vllm/pull/17929>
|
||||
The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM
|
||||
only if the workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
||||
|
||||
### Locality of random state
|
||||
|
||||
The random state in user code (i.e. the code that constructs [LLM][vllm.LLM] class) is updated by vLLM under the following conditions:
|
||||
|
||||
- For V0: The seed is specified.
|
||||
- For V1: The workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
|
||||
|
||||
By default, these conditions are not active so you can use vLLM without having to worry about
|
||||
accidentally making deterministic subsequent operations that rely on random state.
|
||||
By default, `VLLM_ENABLE_V1_MULTIPROCESSING=1` so you can use vLLM without having to worry about
|
||||
accidentally making deterministic subsequent operations that rely on random state.
|
||||
|
||||
@ -2,11 +2,9 @@
|
||||
|
||||
!!! announcement
|
||||
|
||||
We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
|
||||
We have fully deprecated V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
|
||||
|
||||
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
||||
|
||||
## Why vLLM V1?
|
||||
If you have a use case that works on V0 Engine but not V1, please share it on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
|
||||
|
||||
vLLM V0 successfully supported a wide range of models and hardware, but as new features were developed independently, the system grew increasingly complex. This complexity made it harder to integrate new capabilities and introduced technical debt, revealing the need for a more streamlined and unified design.
|
||||
|
||||
@ -32,16 +30,44 @@ Upgrade to vLLM’s Core Architecture](https://blog.vllm.ai/2025/01/27/v1-alpha-
|
||||
|
||||
This living user guide outlines a few known **important changes and limitations** introduced by vLLM V1. The team has been working actively to bring V1 as the default engine, therefore this guide will be updated constantly as more features get supported on vLLM V1.
|
||||
|
||||
## Current Status
|
||||
## Differences from V0
|
||||
|
||||
For each item, our progress towards V1 support falls into one of the following states:
|
||||
This section lists some differences in behavior between V0 and V1.
|
||||
|
||||
- **🚀 Optimized**: Nearly fully optimized, with no further work currently planned.
|
||||
- **🟢 Functional**: Fully operational, with ongoing optimizations.
|
||||
- **🚧 WIP**: Under active development.
|
||||
- **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs).
|
||||
- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later.
|
||||
- **🔴 Deprecated**: Not planned for V1 unless there is strong demand.
|
||||
### Chunked Prefill
|
||||
|
||||
Chunked prefill is enabled by default whenever possible, unlike in V0 where it was conditionally enabled based on model characteristics.
|
||||
|
||||
### CUDA Graphs
|
||||
|
||||
CUDA graph capture takes up more memory in V1 than in V0.
|
||||
|
||||
### Semantic Changes to Logprobs
|
||||
|
||||
#### Logprobs Calculation
|
||||
|
||||
By default, logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e.
|
||||
before applying any logits post-processing such as temperature scaling or penalty
|
||||
adjustments). As a result, the returned logprobs do not reflect the final adjusted
|
||||
probabilities used during sampling.
|
||||
|
||||
You can adjust this behavior by setting the `--logprobs-mode` flag.
|
||||
Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`.
|
||||
Raw means the values before applying any logit processors, like bad words.
|
||||
Processed means the values after applying all processors, including temperature and top_k/top_p.
|
||||
|
||||
#### Prompt Logprobs with Prefix Caching
|
||||
|
||||
While V1 supports passing prompt logprobs with prefix caching enabled, it no longer caches the logprobs.
|
||||
For a request requiring prompt logprobs, the engine will ignore the prefix cache and recompute the prefill of full prompt to generate the logprobs.
|
||||
|
||||
## Feature Support
|
||||
|
||||
For each item, its support in vLLM V1 falls into one of the following states:
|
||||
|
||||
- **🟢 Functional**: Fully operational with optimizations comparable to or better than V0.
|
||||
- **🟡 In Progress**: Planned to be in vLLM V1, with open PRs/RFCs.
|
||||
- **🔴 Removed**: Dropped from vLLM V1. Will only consider re-introducing if there is strong demand.
|
||||
|
||||
!!! note
|
||||
vLLM V1’s unified scheduler treats both prompt and output tokens the same
|
||||
@ -57,13 +83,13 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|
||||
|
||||
### Hardware
|
||||
|
||||
| Hardware | Status |
|
||||
|------------|-----------------------------------------------|
|
||||
| **NVIDIA** | <nobr>🚀</nobr> |
|
||||
| **AMD** | <nobr>🟢</nobr> |
|
||||
| Hardware | Status |
|
||||
|------------------|-----------------------------------------------|
|
||||
| **NVIDIA** | <nobr>🟢</nobr> |
|
||||
| **AMD** | <nobr>🟢</nobr> |
|
||||
| **INTEL GPU** | <nobr>🟢</nobr> |
|
||||
| **TPU** | <nobr>🟢</nobr> |
|
||||
| **CPU** | <nobr>🟢 (x86\_64/aarch64) 🟡 (MacOS) </nobr> |
|
||||
| **TPU** | <nobr>🟢</nobr> |
|
||||
| **CPU** | <nobr>🟢</nobr> |
|
||||
|
||||
!!! note
|
||||
|
||||
@ -78,23 +104,21 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
|
||||
|
||||
### Models
|
||||
|
||||
| Model Type | Status |
|
||||
|-----------------------------|------------------------------------------------------------------------------------|
|
||||
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Encoder-Decoder Models** | <nobr>🟢 Whisper only</nobr> |
|
||||
| **Embedding Models** | <nobr>🟢 Functional</nobr> |
|
||||
| **Mamba Models** | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr> |
|
||||
| **Multimodal Models** | <nobr>🟢 Functional</nobr> |
|
||||
| Model Type | Status |
|
||||
|-----------------------------|-------------------------------------------------------------------------|
|
||||
| **Decoder-only Models** | <nobr>🟢</nobr> |
|
||||
| **Encoder-Decoder Models** | <nobr>🟢 (Whisper), 🔴 (Others) </nobr> |
|
||||
| **Pooling Models** | <nobr>🟢</nobr> |
|
||||
| **Mamba Models** | <nobr>🟢</nobr> |
|
||||
| **Multimodal Models** | <nobr>🟢</nobr> |
|
||||
|
||||
See below for the status of models that are not yet supported or have more features planned in V1.
|
||||
|
||||
#### Embedding Models
|
||||
#### Pooling Models
|
||||
|
||||
The initial basic support is now functional.
|
||||
Now fully supported, with prefix caching and chunked prefill newly available for last-pooling models.
|
||||
|
||||
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249),
|
||||
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360)
|
||||
to enable simultaneous generation and embedding using the same engine instance in V1.
|
||||
We are working on enabling prefix caching and chunked prefill for more categories of pooling models.
|
||||
|
||||
#### Mamba Models
|
||||
|
||||
@ -112,24 +136,25 @@ Please note that prefix caching is not yet supported for any of the above models
|
||||
|
||||
Whisper is supported. Other models requiring cross-attention between separate
|
||||
encoder and decoder (e.g., `BartForConditionalGeneration`,
|
||||
`MllamaForConditionalGeneration`) are not supported.
|
||||
`MllamaForConditionalGeneration`) are no longer supported.
|
||||
|
||||
### Features
|
||||
|
||||
| Feature | Status |
|
||||
|---------------------------------------------|-----------------------------------------------------------------------------------|
|
||||
| **Prefix Caching** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
|
||||
| **LoRA** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Prefix Caching** | <nobr>🟢 Functional</nobr> |
|
||||
| **Chunked Prefill** | <nobr>🟢 Functional</nobr> |
|
||||
| **LoRA** | <nobr>🟢 Functional</nobr> |
|
||||
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
|
||||
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices (<https://github.com/vllm-project/vllm/pull/15191>)</nobr>|
|
||||
| **Spec Decode** | <nobr>🚀 Optimized</nobr> |
|
||||
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>|
|
||||
| **FP8 KV Cache** | <nobr>🟢 Functional</nobr> |
|
||||
| **Spec Decode** | <nobr>🟢 Functional</nobr> |
|
||||
| **Prompt Logprobs with Prefix Caching** | <nobr>🟢 Functional</nobr> |
|
||||
| **Structured Output Alternative Backends** | <nobr>🟢 Functional</nobr> |
|
||||
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
|
||||
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>|
|
||||
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> |
|
||||
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
|
||||
| **Concurrent Partial Prefills** | <nobr>🟡 [In Progress](https://github.com/vllm-project/vllm/issues/14003)</nobr> |
|
||||
| **best_of** | <nobr>🔴 [Removed](https://github.com/vllm-project/vllm/issues/13361)</nobr> |
|
||||
| **Per-Request Logits Processors** | <nobr>🔴 [Removed](https://github.com/vllm-project/vllm/pull/13360)</nobr> |
|
||||
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Removed</nobr> |
|
||||
| **Request-level Structured Output Backend** | <nobr>🔴 Removed</nobr> |
|
||||
|
||||
!!! note
|
||||
|
||||
@ -139,37 +164,16 @@ encoder and decoder (e.g., `BartForConditionalGeneration`,
|
||||
prefix caching, and speculative decoding without a strict separation between prefill
|
||||
and decode phases.
|
||||
|
||||
#### Semantic Changes to Logprobs
|
||||
#### Removed Features
|
||||
|
||||
vLLM V1 supports logprobs and prompt logprobs. However, there are some important semantic
|
||||
differences compared to V0:
|
||||
|
||||
##### Logprobs Calculation
|
||||
|
||||
By default, logprobs in V1 are now returned immediately once computed from the model’s raw output (i.e.
|
||||
before applying any logits post-processing such as temperature scaling or penalty
|
||||
adjustments). As a result, the returned logprobs do not reflect the final adjusted
|
||||
probabilities used during sampling.
|
||||
|
||||
You can adjust this behavior by setting the `--logprobs-mode` flag.
|
||||
Four modes are supported: `raw_logprobs` (default), `processed_logprobs`, `raw_logits`, `processed_logits`.
|
||||
Raw means the values before applying any logit processors, like bad words.
|
||||
Processed means the values after applying all processors, including temperature and top_k/top_p.
|
||||
|
||||
##### Prompt Logprobs with Prefix Caching
|
||||
|
||||
Logprobs are not cached. For a request requiring prompt logprobs, the engine will ignore the prefix cache and recompute the prefill of full prompt to generate the logprobs.
|
||||
|
||||
#### Deprecated Features
|
||||
|
||||
As part of the major architectural rework in vLLM V1, several legacy features have been deprecated.
|
||||
As part of the major architectural rework in vLLM V1, several legacy features have been removed.
|
||||
|
||||
##### Sampling features
|
||||
|
||||
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
|
||||
- **best_of**: This feature has been removed due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361).
|
||||
- **Per-Request Logits Processors**: In V0, users could pass custom
|
||||
processing functions to adjust logits on a per-request basis. In vLLM V1, this
|
||||
feature has been deprecated. Instead, we now support **global logits processors**
|
||||
feature has been removed. Instead, we now support **global logits processors**
|
||||
which are set at startup time, see [RFC #17799](https://github.com/vllm-project/vllm/issues/17799).
|
||||
|
||||
##### KV Cache features
|
||||
@ -179,4 +183,4 @@ to handle request preemptions.
|
||||
|
||||
##### Structured Output features
|
||||
|
||||
- **Request-level Structured Output Backend**: Deprecated, alternative backends (outlines, guidance) with fallbacks is supported now.
|
||||
- **Request-level Structured Output Backend**: Removed; alternative backends (outlines, guidance) with fallbacks are supported now.
|
||||
|
||||
@ -11,12 +11,11 @@ import random
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# V1 only: Turn off multiprocessing to make the scheduling deterministic.
|
||||
# Either:
|
||||
## Turn off multiprocessing to make the scheduling deterministic, or
|
||||
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
|
||||
|
||||
# V0 only: Set the global seed. The default seed is None, which is
|
||||
# not reproducible.
|
||||
SEED = 42
|
||||
## Enable batch invariance to get consistent results regardless of scheduling.
|
||||
os.environ["VLLM_BATCH_INVARIANT"] = "1"
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
@ -28,7 +27,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
|
||||
def main():
|
||||
llm = LLM(model="facebook/opt-125m", seed=SEED)
|
||||
llm = LLM(model="facebook/opt-125m")
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
|
||||
@ -30,8 +30,8 @@ class WorkerExtension:
|
||||
"""
|
||||
The class for vLLM's worker to inherit from.
|
||||
By defining an extension class, the code can work no matter what is
|
||||
the underlying worker class. This way, the code can be compatible
|
||||
with both vLLM V0 and V1.
|
||||
the underlying worker class.
|
||||
|
||||
NOTE: we define this class in a separate module, and the main module
|
||||
should pass the full qualified name as `worker_extension_cls` argument.
|
||||
"""
|
||||
@ -96,8 +96,8 @@ class ColocateWorkerExtension:
|
||||
"""
|
||||
The class for vLLM's worker to inherit from, in the colocate setting.
|
||||
By defining an extension class, the code can work no matter what is
|
||||
the underlying worker class. This way, the code can be compatible
|
||||
with both vLLM V0 and V1.
|
||||
the underlying worker class.
|
||||
|
||||
NOTE: we define this class in a separate module, and the main module
|
||||
should pass the full qualified name as `worker_extension_cls` argument.
|
||||
"""
|
||||
|
||||
@ -67,22 +67,9 @@ def main(args):
|
||||
Path(args.output).mkdir(exist_ok=True)
|
||||
# Dump worker states to output directory
|
||||
|
||||
# Check which engine version is being used
|
||||
is_v1_engine = hasattr(llm.llm_engine, "engine_core")
|
||||
|
||||
if is_v1_engine:
|
||||
# For V1 engine, we need to use engine_core.save_sharded_state
|
||||
print("Using V1 engine save path")
|
||||
llm.llm_engine.engine_core.save_sharded_state(
|
||||
path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
|
||||
)
|
||||
else:
|
||||
# For V0 engine
|
||||
print("Using V0 engine save path")
|
||||
model_executor = llm.llm_engine.model_executor
|
||||
model_executor.save_sharded_state(
|
||||
path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
|
||||
)
|
||||
llm.llm_engine.engine_core.save_sharded_state(
|
||||
path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
|
||||
)
|
||||
|
||||
# Copy metadata files to output directory
|
||||
for file in os.listdir(model_path):
|
||||
|
||||
@ -158,11 +158,7 @@ def main(args):
|
||||
print(f"generated text: {output.outputs[0].text}")
|
||||
print("-" * 50)
|
||||
|
||||
try:
|
||||
metrics = llm.get_metrics()
|
||||
except AssertionError:
|
||||
print("Metrics are not supported in the V0 engine.")
|
||||
return
|
||||
metrics = llm.get_metrics()
|
||||
|
||||
total_num_output_tokens = sum(
|
||||
len(output.outputs[0].token_ids) for output in outputs
|
||||
|
||||
@ -25,25 +25,17 @@ import gradio as gr
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
def format_history_to_openai(history):
|
||||
history_openai_format = [
|
||||
{"role": "system", "content": "You are a great AI assistant."}
|
||||
]
|
||||
for human, assistant in history:
|
||||
history_openai_format.append({"role": "user", "content": human})
|
||||
history_openai_format.append({"role": "assistant", "content": assistant})
|
||||
return history_openai_format
|
||||
|
||||
|
||||
def predict(message, history, client, model_name, temp, stop_token_ids):
|
||||
# Format history to OpenAI chat format
|
||||
history_openai_format = format_history_to_openai(history)
|
||||
history_openai_format.append({"role": "user", "content": message})
|
||||
messages = [
|
||||
{"role": "system", "content": "You are a great AI assistant."},
|
||||
*history,
|
||||
{"role": "user", "content": message},
|
||||
]
|
||||
|
||||
# Send request to OpenAI API (vLLM server)
|
||||
stream = client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=history_openai_format,
|
||||
messages=messages,
|
||||
temperature=temp,
|
||||
stream=True,
|
||||
extra_body={
|
||||
|
||||
44
examples/online_serving/openai_responses_client.py
Normal file
44
examples/online_serving/openai_responses_client.py
Normal file
@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Set up this example by starting a vLLM OpenAI-compatible server.

Reasoning models can be used through the Responses API as seen here
https://platform.openai.com/docs/api-reference/responses

For example:
vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3

"""

from openai import OpenAI

# Conversation state; reasoning output from the first turn is appended
# to it before the follow-up request.
input_messages = [{"role": "user", "content": "What model are you?"}]


def main():
    """Run a two-turn Responses API exchange, feeding the model's
    reasoning messages from the first turn back into the second."""
    base_url = "http://localhost:8000/v1"
    client = OpenAI(base_url=base_url, api_key="empty")
    model = "Qwen/Qwen3-8B"  # get_first_model(client)

    first_response = client.responses.create(
        model=model,
        input=input_messages,
    )

    # Carry the reasoning items forward into the conversation history.
    input_messages.extend(
        item for item in first_response.output if item.type == "reasoning"
    )

    second_response = client.responses.create(
        model=model,
        input=input_messages,
    )
    print(second_response.output_text)
    # I am Qwen, a large language model developed by Alibaba Cloud.
    # I am designed to assist with a wide range of tasks, including
    # answering questions, creating content, coding, and engaging in
    # conversations. I can help with various topics and provide
    # information or support in multiple languages. How can I assist you today?


if __name__ == "__main__":
    main()
|
||||
@ -19,12 +19,12 @@ pillow # Required for image processing
|
||||
prometheus-fastapi-instrumentator >= 7.0.0
|
||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||
lm-format-enforcer == 0.11.3
|
||||
llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x"
|
||||
llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x" or platform_machine == "ppc64le"
|
||||
outlines_core == 0.2.11
|
||||
# required for outlines backend disk cache
|
||||
diskcache == 5.6.3
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x"
|
||||
xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -39,3 +39,9 @@ mteb[bm25s]>=1.38.11, <2
|
||||
|
||||
# Required for eval tests
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
||||
|
||||
# Required for multiprocessed tests that use spawn method
|
||||
multiprocess==0.70.16
|
||||
|
||||
# Plugins test
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
|
||||
|
||||
34
setup.py
34
setup.py
@ -74,18 +74,6 @@ def is_ninja_available() -> bool:
|
||||
return which("ninja") is not None
|
||||
|
||||
|
||||
def is_url_available(url: str) -> bool:
|
||||
from urllib.request import urlopen
|
||||
|
||||
status = None
|
||||
try:
|
||||
with urlopen(url) as f:
|
||||
status = f.status
|
||||
except Exception:
|
||||
return False
|
||||
return status == 200
|
||||
|
||||
|
||||
class CMakeExtension(Extension):
|
||||
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
|
||||
super().__init__(name, sources=[], py_limited_api=True, **kwa)
|
||||
@ -533,28 +521,6 @@ def get_nvcc_cuda_version() -> Version:
|
||||
return nvcc_cuda_version
|
||||
|
||||
|
||||
def get_gaudi_sw_version():
|
||||
"""
|
||||
Returns the driver version.
|
||||
"""
|
||||
# Enable console printing for `hl-smi` check
|
||||
output = subprocess.run(
|
||||
"hl-smi",
|
||||
shell=True,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
env={"ENABLE_CONSOLE": "true"},
|
||||
)
|
||||
if output.returncode == 0 and output.stdout:
|
||||
return (
|
||||
output.stdout.split("\n")[2]
|
||||
.replace(" ", "")
|
||||
.split(":")[1][:-1]
|
||||
.split("-")[0]
|
||||
)
|
||||
return "0.0.0" # when hl-smi is not available
|
||||
|
||||
|
||||
def get_vllm_version() -> str:
|
||||
# Allow overriding the version. This is useful to build platform-specific
|
||||
# wheels (e.g. CPU, TPU) without modifying the source.
|
||||
|
||||
@ -748,6 +748,14 @@ class VllmRunner:
|
||||
# being captured which can trigger edge cases that we don't handle yet.
|
||||
kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]}
|
||||
|
||||
# Make sure we have at least one cudagraph large enough for a single decode.
|
||||
if (speculative_config := kwargs.get("speculative_config")) and (
|
||||
num_speculative_tokens := speculative_config["num_speculative_tokens"]
|
||||
):
|
||||
kwargs["compilation_config"]["cudagraph_capture_sizes"].append(
|
||||
num_speculative_tokens + 1
|
||||
)
|
||||
|
||||
with init_ctx:
|
||||
self.llm = LLM(
|
||||
model=model_name,
|
||||
@ -845,6 +853,7 @@ class VllmRunner:
|
||||
@staticmethod
|
||||
def _final_steps_generate_w_logprobs(
|
||||
req_outputs: list[RequestOutput],
|
||||
include_prompt_token_ids: bool = False,
|
||||
) -> list[TokensTextLogprobsPromptLogprobs]:
|
||||
outputs: list[TokensTextLogprobsPromptLogprobs] = []
|
||||
for req_output in req_outputs:
|
||||
@ -853,9 +862,26 @@ class VllmRunner:
|
||||
output_str = sample.text
|
||||
output_ids = list(sample.token_ids)
|
||||
output_logprobs = sample.logprobs
|
||||
outputs.append(
|
||||
(output_ids, output_str, output_logprobs, req_output.prompt_logprobs)
|
||||
)
|
||||
if include_prompt_token_ids:
|
||||
outputs.append(
|
||||
( # type: ignore[arg-type]
|
||||
output_ids,
|
||||
output_str,
|
||||
output_logprobs,
|
||||
req_output.prompt_token_ids,
|
||||
req_output.prompt_logprobs,
|
||||
)
|
||||
)
|
||||
else:
|
||||
outputs.append(
|
||||
(
|
||||
output_ids,
|
||||
output_str,
|
||||
output_logprobs,
|
||||
req_output.prompt_logprobs,
|
||||
)
|
||||
)
|
||||
|
||||
return outputs
|
||||
|
||||
def generate_w_logprobs(
|
||||
@ -865,6 +891,7 @@ class VllmRunner:
|
||||
images: PromptImageInput | None = None,
|
||||
audios: PromptAudioInput | None = None,
|
||||
videos: PromptVideoInput | None = None,
|
||||
include_prompt_token_ids: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
|
||||
inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
|
||||
@ -874,7 +901,7 @@ class VllmRunner:
|
||||
)
|
||||
|
||||
toks_str_logsprobs_prompt_logprobs = self._final_steps_generate_w_logprobs(
|
||||
req_outputs
|
||||
req_outputs, include_prompt_token_ids
|
||||
)
|
||||
# Omit prompt logprobs if not required by sampling params
|
||||
return (
|
||||
|
||||
49
tests/distributed/eplb_utils.py
Normal file
49
tests/distributed/eplb_utils.py
Normal file
@ -0,0 +1,49 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
from vllm.distributed.parallel_state import (
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
|
||||
def distributed_run(fn, world_size, *args):
    """Spawn ``world_size`` worker processes running ``fn`` and wait for all
    of them to exit successfully.

    Each worker receives a torch-distributed-style environment dict (rank,
    world size, master address/port) as its first argument, followed by
    ``world_size`` and any extra ``*args``.
    """
    workers: list[mp.Process] = []
    for rank in range(world_size):
        worker_env: dict[str, str] = {
            "RANK": str(rank),
            "LOCAL_RANK": str(rank),
            "WORLD_SIZE": str(world_size),
            "LOCAL_WORLD_SIZE": str(world_size),
            "MASTER_ADDR": "localhost",
            "MASTER_PORT": "12345",
        }
        proc = mp.Process(target=fn, args=(worker_env, world_size, *args))
        workers.append(proc)
        proc.start()

    # Wait for every worker to finish before checking exit codes.
    for proc in workers:
        proc.join()

    for proc in workers:
        assert proc.exitcode == 0
|
||||
|
||||
|
||||
def set_env_vars_and_device(env: dict[str, str]) -> None:
    """Apply distributed env vars, bind this worker to its GPU, and
    initialize the distributed environment with fixed RNG seeds."""
    update_environment_variables(env)
    torch.cuda.set_device(torch.device(f"cuda:{os.environ['LOCAL_RANK']}"))
    init_distributed_environment()

    # Ensure each worker process has the same random seed
    random.seed(42)
    torch.manual_seed(42)
||||
@ -1,57 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import random
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
|
||||
def distributed_run(fn, world_size, *args):
|
||||
number_of_processes = world_size
|
||||
processes: list[mp.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env: dict[str, str] = {}
|
||||
env["RANK"] = str(i)
|
||||
env["LOCAL_RANK"] = str(i)
|
||||
env["WORLD_SIZE"] = str(number_of_processes)
|
||||
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
|
||||
env["MASTER_ADDR"] = "localhost"
|
||||
env["MASTER_PORT"] = "12345"
|
||||
p = mp.Process(target=fn, args=(env, world_size, *args))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
for p in processes:
|
||||
assert p.exitcode == 0
|
||||
|
||||
|
||||
def set_env_vars_and_device(env: dict[str, str]) -> None:
|
||||
update_environment_variables(env)
|
||||
local_rank = os.environ["LOCAL_RANK"]
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
|
||||
# Ensure each worker process has the same random seed
|
||||
random.seed(42)
|
||||
torch.manual_seed(42)
|
||||
from .eplb_utils import distributed_run, set_env_vars_and_device
|
||||
|
||||
|
||||
def create_expert_indices_with_redundancy(
|
||||
|
||||
285
tests/distributed/test_eplb_fused_moe_layer.py
Normal file
285
tests/distributed/test_eplb_fused_moe_layer.py
Normal file
@ -0,0 +1,285 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Test that the interaction between EPLB and FusedMoE Layer is okay
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||
|
||||
from .eplb_utils import distributed_run, set_env_vars_and_device
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestConfig:
|
||||
num_layers: int
|
||||
num_experts: int
|
||||
num_local_experts: int
|
||||
num_topk: int
|
||||
hidden_size: int
|
||||
intermediate_size: int
|
||||
weight_dtype: torch.dtype
|
||||
weight_scale_dtype: torch.dtype | None
|
||||
column_major_scales: bool
|
||||
|
||||
|
||||
def make_expert_weights(
    layer_idx: int,
    global_expert_idx: int,
    global_num_experts: int,
    tensor_shape: tuple[int, ...],
    tensor_dtype: torch.dtype,
    tensor_device: torch.device,
    is_column_major: bool,
) -> torch.Tensor:
    """Build a deterministic 2-D weight tensor for one expert.

    Values are consecutive integers starting at an offset unique to the
    (layer, global expert) pair, so weights can be recognized after being
    shuffled across ranks. With ``is_column_major`` the data is filled in
    the transposed layout and returned as a non-contiguous view with the
    requested logical shape.
    """
    assert len(tensor_shape) == 2
    rows, cols = tensor_shape

    # Physical allocation shape: transposed when column-major is requested.
    alloc_shape = (cols, rows) if is_column_major else (rows, cols)
    numel = rows * cols

    # Unique, deterministic starting value per (layer, expert).
    start = (layer_idx * global_num_experts + global_expert_idx) * numel
    weights = torch.arange(
        start,
        start + numel,
        dtype=tensor_dtype,
        device=tensor_device,
    ).reshape(alloc_shape)

    if is_column_major:
        weights = weights.transpose(1, 0)
        assert not weights.is_contiguous()
    return weights
|
||||
|
||||
|
||||
def make_fused_moe_layer(
    rank: int,
    layer_idx: int,
    test_config: TestConfig,
) -> FusedMoE:
    """Construct a FusedMoE layer on this rank's GPU with deterministic,
    globally-identifiable expert weights and block-quant scale tensors.

    Weights are filled via ``make_expert_weights`` so each (layer, global
    expert) slice carries a unique value range, letting tests verify that
    EPLB rearrangement moved the right experts.
    """
    fml = FusedMoE(
        num_experts=test_config.num_experts,
        top_k=test_config.num_topk,
        hidden_size=test_config.hidden_size,
        intermediate_size=test_config.intermediate_size,
        prefix=f"dummy_layer_{layer_idx}",
        activation="silu",
        is_act_and_mul=True,
        params_dtype=test_config.weight_dtype,
    )

    device = torch.device(f"cuda:{rank}")

    from functools import partial

    # Bind the per-layer constants; callers below only vary the expert
    # index, shape, and dtype.
    _make_expert_weights = partial(
        make_expert_weights,
        layer_idx=layer_idx,
        global_num_experts=test_config.num_experts,
        tensor_device=device,
    )

    assert isinstance(fml.w13_weight.data, torch.Tensor)
    assert isinstance(fml.w2_weight.data, torch.Tensor)
    # Move the layer's weight storage onto this rank's GPU before filling.
    fml.w13_weight.data = fml.w13_weight.data.to(device=device)
    fml.w2_weight.data = fml.w2_weight.data.to(device=device)
    w13_weight = fml.w13_weight.data
    w2_weight = fml.w2_weight.data
    assert w13_weight.size(0) == test_config.num_local_experts
    # Fill each local expert slice with its globally-unique value pattern.
    for i in range(test_config.num_local_experts):
        # Global expert index owned by this rank at local slot i.
        g_i = rank * test_config.num_local_experts + i
        w13_weight_e = w13_weight[i]
        w2_weight_e = w2_weight[i]
        w13_weight_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w13_weight_e.shape,
                tensor_dtype=w13_weight_e.dtype,
                is_column_major=False,
            )
        )
        w2_weight_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w2_weight_e.shape,
                tensor_dtype=w2_weight_e.dtype,
                is_column_major=False,
            )
        )

    # Block-quantization granularity for the scale tensors below.
    block_size = 16

    def block_quant_scales_shape(
        shape: tuple[int, ...], is_column_major: bool
    ) -> tuple[int, ...]:
        # One scale per (block_size x block_size) tile of the 2-D expert
        # weight; the last two dims are swapped for column-major layout.
        assert len(shape) == 3
        if not is_column_major:
            return (shape[0], shape[1] // block_size, shape[2] // block_size)
        else:
            return (shape[0], shape[2] // block_size, shape[1] // block_size)

    is_column_major = test_config.column_major_scales
    w13_weight_scale_inv = torch.empty(
        block_quant_scales_shape(w13_weight.shape, is_column_major),
        dtype=test_config.weight_dtype,
        device=device,
    )
    w2_weight_scale_inv = torch.empty(
        block_quant_scales_shape(w2_weight.shape, is_column_major),
        dtype=test_config.weight_dtype,
        device=device,
    )

    # Fill the scale tensors with the same deterministic per-expert pattern.
    for i in range(test_config.num_local_experts):
        g_i = rank * test_config.num_local_experts + i
        w13_s_e = w13_weight_scale_inv[i]
        w2_s_e = w2_weight_scale_inv[i]
        w13_s_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w13_s_e.shape,
                tensor_dtype=w13_s_e.dtype,
                # Fill data in row-major and then
                # transpose if test_config requires col-major.
                is_column_major=False,
            )
        )
        w2_s_e.copy_(
            _make_expert_weights(
                global_expert_idx=g_i,
                tensor_shape=w2_s_e.shape,
                tensor_dtype=w2_s_e.dtype,
                is_column_major=False,
            )
        )
    if is_column_major:
        # Expose the column-major layout as a non-contiguous transposed view.
        w13_weight_scale_inv = torch.transpose(w13_weight_scale_inv, 1, 2)
        w2_weight_scale_inv = torch.transpose(w2_weight_scale_inv, 1, 2)
        assert not w13_weight_scale_inv.is_contiguous()
        assert not w2_weight_scale_inv.is_contiguous()

    # Add scales to the parameter list
    fml.w13_weight_scale_inv = torch.nn.Parameter(
        w13_weight_scale_inv, requires_grad=False
    )
    fml.w2_weight_scale_inv = torch.nn.Parameter(
        w2_weight_scale_inv, requires_grad=False
    )

    return fml
|
||||
|
||||
|
||||
def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
|
||||
# Initialize model parallel (using tensor parallel as an entrypoint
|
||||
# to expert parallel)
|
||||
set_env_vars_and_device(env)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.parallel_config.tensor_parallel_size = world_size
|
||||
vllm_config.parallel_config.enable_expert_parallel = True
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
|
||||
fml_layers = [
|
||||
make_fused_moe_layer(ep_rank, layer_idx, test_config)
|
||||
for layer_idx in range(test_config.num_layers)
|
||||
]
|
||||
rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers]
|
||||
|
||||
indices = torch.zeros(
|
||||
test_config.num_layers, test_config.num_experts, dtype=torch.long
|
||||
)
|
||||
for lidx in range(test_config.num_layers):
|
||||
indices[lidx] = torch.Tensor(range(test_config.num_experts))
|
||||
|
||||
shuffled_indices = torch.zeros_like(indices)
|
||||
for lidx in range(test_config.num_layers):
|
||||
shuffled_indices[lidx] = torch.randperm(test_config.num_experts)
|
||||
|
||||
rearrange_expert_weights_inplace(
|
||||
indices,
|
||||
shuffled_indices,
|
||||
rank_expert_weights,
|
||||
ep_group,
|
||||
is_profile=False,
|
||||
)
|
||||
|
||||
num_local_experts = test_config.num_local_experts
|
||||
num_global_experts = test_config.num_experts
|
||||
for lidx, fml in enumerate(fml_layers):
|
||||
for name, w in fml.named_parameters():
|
||||
for e in range(num_local_experts):
|
||||
g_e = shuffled_indices[lidx][ep_rank * num_local_experts + e]
|
||||
ref = make_expert_weights(
|
||||
layer_idx=lidx,
|
||||
global_expert_idx=int(g_e.item()),
|
||||
global_num_experts=num_global_experts,
|
||||
tensor_shape=w[e].shape,
|
||||
tensor_dtype=w[e].dtype,
|
||||
tensor_device=w[e].device,
|
||||
is_column_major=not w[e].is_contiguous(),
|
||||
)
|
||||
assert w[e].shape == ref.shape and w[e].stride() == ref.stride(), (
|
||||
f"w[{e}] {w[e].size()} {w[e].stride()} vs "
|
||||
f"ref {ref.size()} {ref.stride()}"
|
||||
)
|
||||
torch.testing.assert_close(w[e], ref)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2])
|
||||
@pytest.mark.parametrize("num_layers", [4])
|
||||
@pytest.mark.parametrize("num_experts", [16])
|
||||
@pytest.mark.parametrize("hidden_size", [256])
|
||||
@pytest.mark.parametrize("intermediate_size", [256])
|
||||
@pytest.mark.parametrize("column_major_scales", [True, False])
|
||||
def test_eplb_fml(
|
||||
world_size: int,
|
||||
num_layers: int,
|
||||
num_experts: int,
|
||||
hidden_size: int,
|
||||
intermediate_size: int,
|
||||
column_major_scales: bool,
|
||||
):
|
||||
if torch.cuda.device_count() < world_size:
|
||||
pytest.skip(f"Need at least {world_size} GPUs to run the test")
|
||||
|
||||
num_local_experts = num_experts // world_size
|
||||
num_topk = 4
|
||||
# The dtypes are fine as we are essentially just checking data-copies
|
||||
weight_dtype = torch.bfloat16
|
||||
weight_scale_dtype = torch.bfloat16
|
||||
|
||||
test_config = TestConfig(
|
||||
num_layers=num_layers,
|
||||
num_experts=num_experts,
|
||||
num_local_experts=num_local_experts,
|
||||
num_topk=num_topk,
|
||||
hidden_size=hidden_size,
|
||||
intermediate_size=intermediate_size,
|
||||
weight_dtype=weight_dtype,
|
||||
weight_scale_dtype=weight_scale_dtype,
|
||||
column_major_scales=column_major_scales,
|
||||
)
|
||||
|
||||
distributed_run(
|
||||
_test_eplb_fml,
|
||||
world_size,
|
||||
test_config,
|
||||
)
|
||||
@ -1,9 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
|
||||
import multiprocess as mp
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
@ -20,10 +20,12 @@ from vllm.distributed.parallel_state import (
|
||||
)
|
||||
from vllm.utils.system_utils import update_environment_variables
|
||||
|
||||
mp.set_start_method("spawn", force=True)
|
||||
|
||||
|
||||
def distributed_run(fn, world_size):
|
||||
number_of_processes = world_size
|
||||
processes: list[multiprocessing.Process] = []
|
||||
processes: list[mp.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env: dict[str, str] = {}
|
||||
env["RANK"] = str(i)
|
||||
@ -32,7 +34,7 @@ def distributed_run(fn, world_size):
|
||||
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
|
||||
env["MASTER_ADDR"] = "localhost"
|
||||
env["MASTER_PORT"] = "12345"
|
||||
p = multiprocessing.Process(target=fn, args=(env,))
|
||||
p = mp.Process(target=fn, args=(env,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
|
||||
@ -279,7 +279,7 @@ def test_prefix_cache_default():
|
||||
args = parser.parse_args([])
|
||||
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert not engine_args.enable_prefix_caching, "prefix caching defaults to off."
|
||||
assert engine_args.enable_prefix_caching, "prefix caching should default to on."
|
||||
|
||||
# with flag to turn it on.
|
||||
args = parser.parse_args(["--enable-prefix-caching"])
|
||||
|
||||
71
tests/entrypoints/openai/test_response_api_simple.py
Normal file
71
tests/entrypoints/openai/test_response_api_simple.py
Normal file
@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from openai import OpenAI
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-8B"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"]
|
||||
env_dict = dict(
|
||||
VLLM_ENABLE_RESPONSES_API_STORE="1",
|
||||
# uncomment for tool calling
|
||||
# PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
|
||||
)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_basic(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input="What is 13 * 24?",
|
||||
)
|
||||
assert response is not None
|
||||
print("response: ", response)
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_reasoning_item(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[
|
||||
{"type": "message", "content": "Hello.", "role": "user"},
|
||||
{
|
||||
"type": "reasoning",
|
||||
"id": "lol",
|
||||
"content": [
|
||||
{
|
||||
"type": "reasoning_text",
|
||||
"text": "We need to respond: greeting.",
|
||||
}
|
||||
],
|
||||
"summary": [],
|
||||
},
|
||||
],
|
||||
temperature=0.0,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
# make sure we get a reasoning and text output
|
||||
assert response.output[0].type == "reasoning"
|
||||
assert response.output[1].type == "message"
|
||||
assert type(response.output[1].content[0].text) is str
|
||||
@ -35,7 +35,7 @@ GET_WEATHER_SCHEMA = {
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
args = ["--enforce-eager", "--tool-server", "demo"]
|
||||
args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"]
|
||||
env_dict = dict(
|
||||
VLLM_ENABLE_RESPONSES_API_STORE="1",
|
||||
PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
|
||||
@ -550,6 +550,31 @@ def call_function(name, args):
|
||||
raise ValueError(f"Unknown function: {name}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_reasoning_item(client: OpenAI, model_name: str):
|
||||
response = await client.responses.create(
|
||||
model=model_name,
|
||||
input=[
|
||||
{"type": "message", "content": "Hello.", "role": "user"},
|
||||
{
|
||||
"type": "reasoning",
|
||||
"id": "lol",
|
||||
"content": [
|
||||
{
|
||||
"type": "reasoning_text",
|
||||
"text": "We need to respond: greeting.",
|
||||
}
|
||||
],
|
||||
"summary": [],
|
||||
},
|
||||
],
|
||||
temperature=0.0,
|
||||
)
|
||||
assert response is not None
|
||||
assert response.status == "completed"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_function_calling(client: OpenAI, model_name: str):
|
||||
|
||||
@ -11,6 +11,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
run_mteb_embed_task,
|
||||
)
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||
|
||||
|
||||
@ -13,6 +13,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
|
||||
run_mteb_rerank,
|
||||
)
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
|
||||
|
||||
|
||||
@ -9,6 +9,12 @@ import torch.nn.functional as F
|
||||
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
|
||||
@ -7,6 +7,12 @@ import pytest
|
||||
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
|
||||
|
||||
@ -9,6 +9,12 @@ import torch
|
||||
from tests.models.utils import softmax
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
||||
|
||||
|
||||
@ -19,6 +19,7 @@ from vllm.entrypoints.openai.protocol import (
|
||||
EmbeddingResponse,
|
||||
PoolingResponse,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.utils.serial_utils import (
|
||||
EMBED_DTYPE_TO_TORCH_DTYPE,
|
||||
@ -28,6 +29,11 @@ from vllm.utils.serial_utils import (
|
||||
decode_pooling_output,
|
||||
)
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
@ -12,6 +12,12 @@ from tests.models.language.pooling.embed_utils import run_embedding_correctness_
|
||||
from tests.models.utils import EmbedModelInfo
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
|
||||
|
||||
@ -16,6 +16,12 @@ import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
|
||||
def _generate_random_text(word_count: int) -> str:
|
||||
|
||||
@ -8,6 +8,12 @@ import torch.nn.functional as F
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "BAAI/bge-reranker-base"
|
||||
DTYPE = "bfloat16"
|
||||
|
||||
@ -10,6 +10,12 @@ from torch import tensor
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.entrypoints.openai.protocol import ScoreResponse
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
|
||||
|
||||
@ -7,6 +7,12 @@ import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
|
||||
)
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
|
||||
max_model_len = 128
|
||||
|
||||
@ -1,7 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from openai.types.responses.response_reasoning_item import (
|
||||
Content,
|
||||
ResponseReasoningItem,
|
||||
Summary,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.responses_utils import (
|
||||
construct_chat_message_with_tool_call,
|
||||
convert_tool_responses_to_completions_format,
|
||||
)
|
||||
|
||||
@ -28,3 +36,53 @@ class TestResponsesUtils:
|
||||
result = convert_tool_responses_to_completions_format(input_tool)
|
||||
|
||||
assert result == {"type": "function", "function": input_tool}
|
||||
|
||||
def test_construct_chat_message_with_tool_call(self):
|
||||
item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[],
|
||||
type="reasoning",
|
||||
content=[
|
||||
Content(
|
||||
text="Leroy Jenkins",
|
||||
type="reasoning_text",
|
||||
)
|
||||
],
|
||||
encrypted_content=None,
|
||||
status=None,
|
||||
)
|
||||
formatted_item = construct_chat_message_with_tool_call(item)
|
||||
assert formatted_item["role"] == "assistant"
|
||||
assert formatted_item["reasoning"] == "Leroy Jenkins"
|
||||
|
||||
item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[
|
||||
Summary(
|
||||
text='Hmm, the user has just started with a simple "Hello,"',
|
||||
type="summary_text",
|
||||
)
|
||||
],
|
||||
type="reasoning",
|
||||
content=None,
|
||||
encrypted_content=None,
|
||||
status=None,
|
||||
)
|
||||
|
||||
formatted_item = construct_chat_message_with_tool_call(item)
|
||||
assert formatted_item["role"] == "assistant"
|
||||
assert (
|
||||
formatted_item["reasoning"]
|
||||
== 'Hmm, the user has just started with a simple "Hello,"'
|
||||
)
|
||||
|
||||
item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[],
|
||||
type="reasoning",
|
||||
content=None,
|
||||
encrypted_content="TOP_SECRET_MESSAGE",
|
||||
status=None,
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
construct_chat_message_with_tool_call(item)
|
||||
|
||||
@ -68,6 +68,7 @@ def test_copy_blocks(
|
||||
pytest.skip()
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
# Generate random block mappings where each source block is mapped to two
|
||||
# destination blocks.
|
||||
assert 2 * num_mappings <= num_blocks
|
||||
@ -152,6 +153,7 @@ def test_reshape_and_cache(
|
||||
pytest.skip()
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
# Create a random slot mapping.
|
||||
num_slots = block_size * num_blocks
|
||||
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
|
||||
@ -272,6 +274,7 @@ def test_reshape_and_cache_flash(
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
assert implementation in ["cuda", "triton"]
|
||||
if implementation == "triton" and kv_cache_layout == "HND":
|
||||
pytest.skip("Triton implementation only supports NHD layout.")
|
||||
@ -593,6 +596,7 @@ def test_concat_and_cache_mla(
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
total_slots = num_blocks * block_size
|
||||
slot_mapping_lst = random.sample(range(total_slots), num_tokens)
|
||||
@ -662,11 +666,14 @@ def test_concat_and_cache_ds_mla(
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip("concat_and_cache_mla doesn't support fp8_ds_mla on ROCm")
|
||||
if dtype.itemsize != 2:
|
||||
pytest.skip("ds_mla only supports 16-bit input")
|
||||
kv_cache_dtype = "fp8_ds_mla"
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
total_slots = num_blocks * block_size
|
||||
slot_mapping_lst = random.sample(range(total_slots), num_tokens)
|
||||
@ -779,6 +786,7 @@ def test_copy_blocks_mla(
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
entry_size = kv_lora_rank + qk_rope_head_dim
|
||||
|
||||
@ -843,6 +851,7 @@ def test_swap_blocks_mla(
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
entry_size = kv_lora_rank + qk_rope_head_dim
|
||||
|
||||
|
||||
@ -39,6 +39,11 @@ MNK_FACTORS = [
|
||||
NUM_EXPERTS = [8, 64]
|
||||
TOP_KS = [1, 2, 6]
|
||||
|
||||
DTYPES = [torch.bfloat16]
|
||||
|
||||
if not current_platform.is_fp8_fnuz():
|
||||
DTYPES.append(torch.float8_e4m3fn)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
|
||||
@ -96,7 +101,7 @@ class BatchedMMTensors:
|
||||
@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512])
|
||||
@pytest.mark.parametrize("K", [128, 1024])
|
||||
@pytest.mark.parametrize("N", [128, 1024])
|
||||
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
def test_batched_mm(
|
||||
@ -229,7 +234,7 @@ def test_batched_mm(
|
||||
@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
|
||||
@pytest.mark.parametrize("dtype", DTYPES)
|
||||
@pytest.mark.parametrize("per_act_token_quant", [False, True])
|
||||
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
|
||||
@pytest.mark.parametrize("input_scales", [False])
|
||||
|
||||
@ -31,6 +31,11 @@ dg_available = has_deep_gemm()
|
||||
|
||||
if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
|
||||
if current_platform.is_fp8_fnuz():
|
||||
pytest.skip(
|
||||
"Tests in this file require float8_e4m3fn and platform does not support",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
|
||||
@ -270,6 +270,11 @@ class Case:
|
||||
@pytest.mark.parametrize("num_token", [2])
|
||||
@pytest.mark.parametrize("tp", [1, 2, 4, 8])
|
||||
def test_equiv(num_token, a_dtype, w_dtype, tp):
|
||||
from triton_kernels.tensor_details import layout
|
||||
|
||||
if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"):
|
||||
pytest.skip("make_default_matmul_mxfp4_w_layout not available")
|
||||
|
||||
M = num_token
|
||||
E = ModelConfig.num_experts
|
||||
K = ModelConfig.hidden_size
|
||||
|
||||
@ -46,6 +46,12 @@ meets_multi_gpu_requirements = pytest.mark.skipif(
|
||||
reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages",
|
||||
)
|
||||
|
||||
if current_platform.is_fp8_fnuz():
|
||||
pytest.skip(
|
||||
"Tests in this file require float8_e4m3fn and platform does not support",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
def format_result(verbose, msg, ex=None):
|
||||
if ex is not None:
|
||||
|
||||
@ -23,6 +23,12 @@ TOP_KS = [2, 6, 8]
|
||||
EP_SIZE = [1, 4, 16]
|
||||
current_platform.seed_everything(0)
|
||||
|
||||
if current_platform.is_rocm():
|
||||
pytest.skip(
|
||||
"moe_permute_unpermute_supported is not defined for ROCm",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
def torch_permute(
|
||||
hidden_states: torch.Tensor,
|
||||
|
||||
@ -14,6 +14,12 @@ from vllm.platforms import current_platform
|
||||
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
|
||||
from vllm.utils.math_utils import cdiv, round_up
|
||||
|
||||
if current_platform.is_fp8_fnuz():
|
||||
pytest.skip(
|
||||
"Tests in this file require float8_e4m3fn and platform does not support",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
fp8_dtype = torch.float8_e4m3fn
|
||||
|
||||
CASES = [
|
||||
|
||||
@ -19,6 +19,12 @@ if current_platform.get_device_capability() < (9, 0):
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
if current_platform.is_fp8_fnuz():
|
||||
pytest.skip(
|
||||
"Tests in this file require float8_e4m3fn and platform does not support",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
|
||||
"""Matrix multiplication function that supports per-token input
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
@ -84,14 +86,17 @@ def test_gpt_oss_lora(gptoss20b_lora_files):
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_gpt_oss_lora_tp2(gptoss20b_lora_files):
|
||||
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
|
||||
def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=8,
|
||||
max_num_seqs=16,
|
||||
tensor_parallel_size=2,
|
||||
fully_sharded_loras=fully_sharded_loras,
|
||||
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
|
||||
cudagraph_specialize_lora=False,
|
||||
),
|
||||
|
||||
@ -60,18 +60,9 @@ def llama_3p2_1b_files():
|
||||
|
||||
def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
|
||||
llm_sharded_writer = LLM(model=input_dir, **kwargs)
|
||||
# Check which engine version is being used
|
||||
is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
|
||||
|
||||
# Dump worker states to output directory
|
||||
if is_v1_engine:
|
||||
# For V1 engine, we need to use engine_core.save_sharded_state
|
||||
print("Using V1 engine save path")
|
||||
llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)
|
||||
else:
|
||||
# For V0 engine
|
||||
print("Using V0 engine save path")
|
||||
model_executor = llm_sharded_writer.llm_engine.model_executor
|
||||
model_executor.save_sharded_state(path=output_dir)
|
||||
llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)
|
||||
|
||||
# Copy metadata files to output directory
|
||||
for file in os.listdir(input_dir):
|
||||
|
||||
@ -10,13 +10,6 @@ from ....utils import large_gpu_mark
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# These have unsupported head_dim for FA. We do not
|
||||
# have a clean way to fall back, so we fail with
|
||||
# a clear msg when it happens.
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
# NOTE(woosuk): Skipping these tests until V1 supports them.
|
||||
# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
|
||||
@ -208,7 +208,7 @@ def test_mistral_format(
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="auto",
|
||||
tokenizer_mode="hf",
|
||||
load_format="safetensors",
|
||||
config_format="hf",
|
||||
) as hf_format_model:
|
||||
|
||||
@ -50,12 +50,24 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
is_mistral_model = model_arch in [
|
||||
"Mistral3ForConditionalGeneration",
|
||||
"PixtralForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
]
|
||||
|
||||
if not is_mistral_model or model_info.tokenizer_mode == "mistral":
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
else:
|
||||
tokenizer_mode = "hf"
|
||||
|
||||
model_id = model_info.default
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
config_format="hf",
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
|
||||
@ -259,6 +259,9 @@ def validate_generated_texts(
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1,
|
||||
tokenizer_mode="hf",
|
||||
load_format="hf",
|
||||
config_format="hf",
|
||||
) as llm:
|
||||
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
@ -725,6 +725,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"NemotronH_Nano_VL_V2": _HfExamplesInfo(
|
||||
"nano_vl_dummy", is_available_online=False, trust_remote_code=True
|
||||
),
|
||||
"OpenCUAForConditionalGeneration": _HfExamplesInfo(
|
||||
"xlangai/OpenCUA-7B", trust_remote_code=True
|
||||
),
|
||||
"Ovis": _HfExamplesInfo(
|
||||
"AIDC-AI/Ovis2-1B",
|
||||
trust_remote_code=True,
|
||||
|
||||
@ -128,6 +128,12 @@ CONFIGS: dict[str, ServerConfig] = {
|
||||
"arguments": [
|
||||
"--enforce-eager",
|
||||
"--no-enable-prefix-caching",
|
||||
"--tokenizer_mode",
|
||||
"hf",
|
||||
"--load_format",
|
||||
"hf",
|
||||
"--config_format",
|
||||
"hf",
|
||||
"--tool-call-parser",
|
||||
"mistral",
|
||||
"--chat-template",
|
||||
@ -140,21 +146,22 @@ CONFIGS: dict[str, ServerConfig] = {
|
||||
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
|
||||
"to the user's question - just respond to it normally.",
|
||||
},
|
||||
# V1 Test: Passing locally but failing in CI. This runs the
|
||||
# V0 Engine because of CPU offloading. Need to debug why.
|
||||
# FIXME: This test currently fails, need to debug why.
|
||||
# "granite20b": {
|
||||
# "model":
|
||||
# "mbayser/granite-20b-functioncalling-FP8-KV",
|
||||
# "model": "mbayser/granite-20b-functioncalling-FP8-KV",
|
||||
# "arguments": [
|
||||
# "--tool-call-parser", "granite-20b-fc", "--chat-template",
|
||||
# str(VLLM_PATH /
|
||||
# "examples/tool_chat_template_granite_20b_fc.jinja"),
|
||||
# "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
|
||||
# "--tool-call-parser",
|
||||
# "granite-20b-fc",
|
||||
# "--chat-template",
|
||||
# str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja"),
|
||||
# "--max_num_seqs",
|
||||
# "1",
|
||||
# "--enforce-eager",
|
||||
# "--cpu-offload-gb",
|
||||
# "20",
|
||||
# ],
|
||||
# "supports_parallel":
|
||||
# False,
|
||||
# "supports_rocm":
|
||||
# False,
|
||||
# "supports_parallel": False,
|
||||
# "supports_rocm": False,
|
||||
# },
|
||||
"granite-3.0-8b": {
|
||||
"model": "ibm-granite/granite-3.0-8b-instruct",
|
||||
|
||||
62
tests/transformers_utils/test_config.py
Normal file
62
tests/transformers_utils/test_config.py
Normal file
@ -0,0 +1,62 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, call, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.transformers_utils.config import list_filtered_repo_files
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"allow_patterns,expected_relative_files",
|
||||
[
|
||||
(
|
||||
["*.json", "correct*.txt"],
|
||||
["json_file.json", "subfolder/correct.txt", "correct_2.txt"],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_list_filtered_repo_files(
|
||||
allow_patterns: list[str], expected_relative_files: list[str]
|
||||
):
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
# Prep folder and files
|
||||
path_tmp_dir = Path(tmp_dir)
|
||||
subfolder = path_tmp_dir / "subfolder"
|
||||
subfolder.mkdir()
|
||||
(path_tmp_dir / "json_file.json").touch()
|
||||
(path_tmp_dir / "correct_2.txt").touch()
|
||||
(path_tmp_dir / "uncorrect.txt").touch()
|
||||
(path_tmp_dir / "uncorrect.jpeg").touch()
|
||||
(subfolder / "correct.txt").touch()
|
||||
(subfolder / "uncorrect_sub.txt").touch()
|
||||
|
||||
def _glob_path() -> list[str]:
|
||||
return [
|
||||
str(file.relative_to(path_tmp_dir))
|
||||
for file in path_tmp_dir.glob("**/*")
|
||||
if file.is_file()
|
||||
]
|
||||
|
||||
# Patch list_repo_files called by fn
|
||||
with patch(
|
||||
"vllm.transformers_utils.config.list_repo_files",
|
||||
MagicMock(return_value=_glob_path()),
|
||||
) as mock_list_repo_files:
|
||||
out_files = sorted(
|
||||
list_filtered_repo_files(
|
||||
tmp_dir, allow_patterns, "revision", "model", "token"
|
||||
)
|
||||
)
|
||||
assert out_files == sorted(expected_relative_files)
|
||||
assert mock_list_repo_files.call_count == 1
|
||||
assert mock_list_repo_files.call_args_list[0] == call(
|
||||
repo_id=tmp_dir,
|
||||
revision="revision",
|
||||
repo_type="model",
|
||||
token="token",
|
||||
)
|
||||
@ -2,7 +2,11 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
from vllm.transformers_utils.utils import is_cloud_storage, is_gcs, is_s3
|
||||
from vllm.transformers_utils.utils import (
|
||||
is_cloud_storage,
|
||||
is_gcs,
|
||||
is_s3,
|
||||
)
|
||||
|
||||
|
||||
def test_is_gcs():
|
||||
|
||||
@ -61,7 +61,7 @@ for backend in BACKENDS_TO_TEST:
|
||||
|
||||
BACKEND_BLOCK_SIZES = {}
|
||||
for backend in BACKENDS_TO_TEST:
|
||||
supported_sizes = backend.get_class().supported_kernel_block_sizes
|
||||
supported_sizes = backend.get_class().get_supported_kernel_block_sizes()
|
||||
if supported_sizes:
|
||||
default_size = supported_sizes[0]
|
||||
block_size = (
|
||||
|
||||
@ -76,11 +76,11 @@ def test_get_num_unfinished_requests():
|
||||
@pytest.mark.parametrize(
|
||||
"enable_prefix_caching, prompt_logprobs",
|
||||
[
|
||||
(None, None),
|
||||
(False, None),
|
||||
(True, 5),
|
||||
],
|
||||
)
|
||||
def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None):
|
||||
def test_schedule(enable_prefix_caching: bool, prompt_logprobs: int | None):
|
||||
"""Test scheduling.
|
||||
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
||||
"""
|
||||
@ -582,12 +582,12 @@ def test_check_stop_min_tokens():
|
||||
@pytest.mark.parametrize(
|
||||
"enable_prefix_caching, prompt_logprobs",
|
||||
[
|
||||
(None, None),
|
||||
(False, None),
|
||||
(True, 5),
|
||||
],
|
||||
)
|
||||
def test_schedule_concurrent_batches(
|
||||
enable_prefix_caching: bool | None, prompt_logprobs: int | None
|
||||
enable_prefix_caching: bool, prompt_logprobs: int | None
|
||||
):
|
||||
scheduler = create_scheduler(
|
||||
max_num_batched_tokens=1024,
|
||||
@ -1057,7 +1057,8 @@ def test_kv_connector_basic(is_async: bool):
|
||||
)
|
||||
|
||||
|
||||
def test_external_prefix_cache_metrics():
|
||||
@pytest.mark.parametrize("is_async", [False, True])
|
||||
def test_external_prefix_cache_metrics(is_async: bool):
|
||||
"""
|
||||
Verify connector prefix cache metrics are updated
|
||||
correctly when the scheduler processes requests with KV connector hits.
|
||||
@ -1067,7 +1068,9 @@ def test_external_prefix_cache_metrics():
|
||||
NUM_MATCHED_NEW_TOKENS = 4
|
||||
scheduler = create_scheduler(
|
||||
enable_prefix_caching=False,
|
||||
use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False),
|
||||
use_kv_connector=mock_kv(
|
||||
matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async
|
||||
),
|
||||
)
|
||||
|
||||
# --- Prepare simple requests ---
|
||||
@ -1079,9 +1082,15 @@ def test_external_prefix_cache_metrics():
|
||||
num_tokens=NUM_TOKENS,
|
||||
max_tokens=MAX_TOKENS,
|
||||
)
|
||||
req_ids = []
|
||||
req_to_index = {}
|
||||
for i, request in enumerate(requests):
|
||||
scheduler.add_request(request)
|
||||
req_ids.append(request.request_id)
|
||||
req_to_index[request.request_id] = i
|
||||
|
||||
for req in requests:
|
||||
scheduler.add_request(req)
|
||||
if is_async:
|
||||
_step_until_kv_transfer_finished(scheduler, req_ids)
|
||||
|
||||
# --- Trigger scheduling and simulate model output ---
|
||||
output = scheduler.schedule()
|
||||
@ -1416,7 +1425,7 @@ def create_scheduler_with_priority(
|
||||
model: str = "facebook/opt-125m",
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
enable_prefix_caching: bool | None = None,
|
||||
enable_prefix_caching: bool = False,
|
||||
long_prefill_token_threshold: int = 0,
|
||||
disable_chunked_mm_input: bool = False,
|
||||
use_kv_connector: bool = False,
|
||||
@ -1435,7 +1444,7 @@ def create_scheduler_with_priority(
|
||||
max_num_batch_tokens: max num tokens to batch
|
||||
enable_prefix_caching: optionally force APC config
|
||||
(True/False) or use default
|
||||
(None)
|
||||
(False)
|
||||
|
||||
Returns:
|
||||
{class}`Scheduler` instance with priority scheduling
|
||||
@ -1458,17 +1467,12 @@ def create_scheduler_with_priority(
|
||||
seed=42,
|
||||
)
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache = (
|
||||
{}
|
||||
if enable_prefix_caching is None
|
||||
else {"enable_prefix_caching": enable_prefix_caching}
|
||||
)
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
**kwargs_cache,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
)
|
||||
kv_transfer_config = (
|
||||
KVTransferConfig(
|
||||
|
||||
@ -42,7 +42,7 @@ def create_scheduler(
|
||||
model: str = "facebook/opt-125m",
|
||||
max_num_seqs: int = 16,
|
||||
max_num_batched_tokens: int = 8192,
|
||||
enable_prefix_caching: bool | None = None,
|
||||
enable_prefix_caching: bool = False,
|
||||
long_prefill_token_threshold: int = 0,
|
||||
disable_chunked_mm_input: bool = False,
|
||||
use_kv_connector: None | bool | MockKVConfig = None,
|
||||
@ -63,7 +63,7 @@ def create_scheduler(
|
||||
max_num_batch_tokens: max num tokens to batch
|
||||
enable_prefix_caching: optionally force APC config
|
||||
(True/False) or use default
|
||||
(None)
|
||||
(False)
|
||||
|
||||
Returns:
|
||||
{class}`Scheduler` instance
|
||||
@ -87,17 +87,12 @@ def create_scheduler(
|
||||
skip_tokenizer_init=skip_tokenizer_init,
|
||||
)
|
||||
# Cache config, optionally force APC
|
||||
kwargs_cache = (
|
||||
{}
|
||||
if enable_prefix_caching is None
|
||||
else {"enable_prefix_caching": enable_prefix_caching}
|
||||
)
|
||||
cache_config = CacheConfig(
|
||||
block_size=block_size,
|
||||
gpu_memory_utilization=0.9,
|
||||
swap_space=0,
|
||||
cache_dtype="auto",
|
||||
**kwargs_cache,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
)
|
||||
kv_transfer_config = None
|
||||
if isinstance(use_kv_connector, MockKVConfig):
|
||||
|
||||
@ -190,6 +190,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
max_num_seqs=32,
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16", # not everything is supported
|
||||
gpu_memory_utilization=0.9,
|
||||
)
|
||||
|
||||
# Use more realistic prompts for better token generation
|
||||
@ -444,6 +445,7 @@ def test_logprobs_without_batch_invariance_should_fail(
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
|
||||
# CRITICAL: Disable batch invariance for this test
|
||||
monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
|
||||
monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
|
||||
@ -6,6 +6,7 @@ import random
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.attention.utils.fa_utils import flash_attn_supports_mla
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
skip_unsupported = pytest.mark.skipif(
|
||||
@ -18,7 +19,7 @@ BACKENDS: list[str] = [
|
||||
"FLASHINFER",
|
||||
]
|
||||
|
||||
if current_platform.is_cuda() and current_platform.is_device_capability(90):
|
||||
if flash_attn_supports_mla():
|
||||
BACKENDS.append("FLASH_ATTN_MLA")
|
||||
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-1.7B"
|
||||
|
||||
@ -61,8 +61,6 @@ def test_batch_inference_correctness(
|
||||
model_setup: (method, model_name, spec_model_name, lora_path, tp_size)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Disable randomness
|
||||
m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
|
||||
torch.manual_seed(SEED)
|
||||
|
||||
@ -46,11 +46,15 @@ EAGLE_SPEC_CONFIG = {
|
||||
|
||||
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
|
||||
("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None),
|
||||
("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None),
|
||||
# FIXME: Since "auto" will use Mistral tokenizer and these backends do not support
|
||||
# it, we skip these tests for now.
|
||||
# ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None),
|
||||
# ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None),
|
||||
("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", None),
|
||||
pytest.param(
|
||||
"mistralai/Ministral-8B-Instruct-2410",
|
||||
"lm-format-enforcer",
|
||||
"auto",
|
||||
"hf",
|
||||
None,
|
||||
marks=pytest.mark.skip(
|
||||
reason=(
|
||||
@ -80,7 +84,7 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
|
||||
# ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None),
|
||||
# ("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"),
|
||||
("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG),
|
||||
("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", NGRAM_SPEC_CONFIG),
|
||||
("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", NGRAM_SPEC_CONFIG),
|
||||
("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG),
|
||||
("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", EAGLE_SPEC_CONFIG),
|
||||
]
|
||||
@ -151,6 +155,8 @@ def test_structured_output(
|
||||
),
|
||||
seed=120,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format="auto" if not model_name.startswith("mistralai/") else "hf",
|
||||
config_format="auto" if not model_name.startswith("mistralai/") else "hf",
|
||||
speculative_config=speculative_config,
|
||||
)
|
||||
|
||||
@ -720,6 +726,8 @@ def test_structured_output_auto_mode(
|
||||
max_model_len=1024,
|
||||
structured_outputs_config=dict(backend="auto"),
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format="auto",
|
||||
config_format="auto",
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
|
||||
@ -9,6 +9,12 @@
|
||||
# Assumption vs. Correctness Tests:
|
||||
# these unit tests do *not* test correctness of LMCache-side or vLLM-side logic
|
||||
# it is to ensure that assumptions LMCache makes about vLLM's interface are stable
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def assumes(obj, attr, is_callable=False, is_instance_of=None):
|
||||
import inspect
|
||||
from dataclasses import is_dataclass
|
||||
@ -48,6 +54,9 @@ def assumes(obj, attr, is_callable=False, is_instance_of=None):
|
||||
assert isinstance(attr_value, is_instance_of), assumption_msg
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm"
|
||||
)
|
||||
def test_multimodal_interface():
|
||||
# protect against interface changes
|
||||
from vllm.multimodal.inputs import PlaceholderRange
|
||||
@ -72,6 +81,9 @@ def test_multimodal_interface():
|
||||
assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm"
|
||||
)
|
||||
def test_config_interface():
|
||||
# protect against interface changes
|
||||
from vllm.config import VllmConfig
|
||||
@ -146,6 +158,9 @@ def test_config_interface():
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm"
|
||||
)
|
||||
def test_request_interface():
|
||||
# protect against interface changes
|
||||
from types import NoneType
|
||||
|
||||
@ -20,6 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.multi_connector import (
|
||||
from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
|
||||
NixlKVConnectorStats,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
|
||||
|
||||
@ -69,6 +70,13 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_rocm(),
|
||||
reason=(
|
||||
"hipErrorLaunchFailure when running this test, see issue:"
|
||||
"https://github.com/ROCm/pytorch/issues/2822"
|
||||
),
|
||||
)
|
||||
def test_multi_shared_storage_connector_consistency():
|
||||
"""
|
||||
Tests that MultiConnector with two SharedStorageConnectors saves
|
||||
|
||||
@ -12,10 +12,14 @@ from tqdm import tqdm
|
||||
from vllm import LLM, SamplingParams, TokensPrompt
|
||||
from vllm.config import KVEventsConfig, KVTransferConfig
|
||||
from vllm.distributed.kv_events import BlockStored, KVEventBatch
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.system_utils import set_env_var
|
||||
|
||||
CPU_BLOCK_SIZES = [48]
|
||||
ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"]
|
||||
ATTN_BACKENDS = ["FLASH_ATTN"]
|
||||
|
||||
if current_platform.is_cuda():
|
||||
ATTN_BACKENDS.append("FLASHINFER")
|
||||
|
||||
|
||||
class MockSubscriber:
|
||||
|
||||
@ -521,8 +521,8 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode):
|
||||
pytest.param(
|
||||
(
|
||||
"eagle",
|
||||
"meta-llama/Llama-3.1-8B-Instruct",
|
||||
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama3_2_1B_speculator.eagle3",
|
||||
),
|
||||
marks=large_gpu_mark(min_gb=32),
|
||||
),
|
||||
@ -541,7 +541,7 @@ def test_spec_decode_logprobs(
|
||||
"""
|
||||
from vllm import LLM
|
||||
|
||||
prompt = "Hello world"
|
||||
prompt = "Hello world " * 50
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0, logprobs=3, max_tokens=10, ignore_eos=False
|
||||
)
|
||||
@ -582,6 +582,9 @@ def test_spec_decode_logprobs(
|
||||
seed=42,
|
||||
logprobs_mode=logprobs_mode,
|
||||
gpu_memory_utilization=0.4,
|
||||
# Force prefill chunking
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=32,
|
||||
)
|
||||
spec_results = spec_llm.generate([prompt], sampling_params)
|
||||
# Collect logprobs outputs from spec decode LLM.
|
||||
@ -597,6 +600,84 @@ def test_spec_decode_logprobs(
|
||||
# Per-token logprobs are expected to be the same.
|
||||
assert len(ref_logprobs) == len(spec_logprobs)
|
||||
for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs):
|
||||
assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3)
|
||||
assert math.isclose(
|
||||
ref_logprob.logprob, spec_logprob.logprob, rel_tol=5e-2, abs_tol=1e-1
|
||||
)
|
||||
assert ref_logprob.rank == spec_logprob.rank
|
||||
assert ref_logprob.decoded_token == spec_logprob.decoded_token
|
||||
|
||||
|
||||
def test_prompt_logprobs_with_chunking_and_preemption():
|
||||
"""Test that prompt logprobs are correctly returned when using
|
||||
both chunked prefill and preemption.
|
||||
|
||||
This test ensures that the num_prompt_logprobs tracking persists
|
||||
across preemptions and prefill chunks.
|
||||
"""
|
||||
|
||||
# Create prompts that will trigger chunking and preemption
|
||||
prompts = [
|
||||
"The following numbers of the sequence "
|
||||
+ ", ".join(str(i) for i in range(10))
|
||||
+ " are:",
|
||||
"In one word, the capital of France is ",
|
||||
] + [f"Tell me about the number {i}: " for i in range(32)]
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0,
|
||||
max_tokens=40,
|
||||
min_tokens=20,
|
||||
prompt_logprobs=2, # Request prompt logprobs
|
||||
)
|
||||
|
||||
with VllmRunner(
|
||||
"Qwen/Qwen3-0.6B",
|
||||
max_model_len=512,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_batched_tokens=48, # Force prefill chunking
|
||||
num_gpu_blocks_override=32, # Force preemptions
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.25,
|
||||
) as vllm_model:
|
||||
metrics_before = vllm_model.llm.get_metrics()
|
||||
|
||||
# Generate with prompt logprobs using generate_w_logprobs which
|
||||
# returns (output_ids, output_str, output_logprobs, prompt_logprobs)
|
||||
outputs = vllm_model.generate_w_logprobs(
|
||||
prompts, sampling_params=sampling_params, include_prompt_token_ids=True
|
||||
)
|
||||
|
||||
# Verify that all outputs have prompt logprobs
|
||||
for i, output in enumerate(outputs):
|
||||
_, _, _, prompt_token_ids, prompt_logprobs = output
|
||||
assert prompt_logprobs is not None and len(prompt_logprobs) > 0, (
|
||||
f"Output {i} missing prompt logprobs"
|
||||
)
|
||||
assert len(prompt_logprobs) == len(prompt_token_ids), (
|
||||
"Unexpected number of prompt logprob positions"
|
||||
)
|
||||
|
||||
# Each position should have the requested number of logprobs
|
||||
for pos, logprobs_dict in enumerate(prompt_logprobs):
|
||||
if logprobs_dict is not None: # First token may be None
|
||||
assert (
|
||||
sampling_params.prompt_logprobs
|
||||
<= len(logprobs_dict)
|
||||
<= sampling_params.prompt_logprobs + 1
|
||||
), (
|
||||
f"Output {i} position {pos} has {len(logprobs_dict)} "
|
||||
f"logprobs, expected {sampling_params.prompt_logprobs}"
|
||||
)
|
||||
|
||||
# Check that we actually had preemptions
|
||||
metrics_after = vllm_model.llm.get_metrics()
|
||||
preemptions_before = next(
|
||||
(m.value for m in metrics_before if m.name == "vllm:num_preemptions"), 0
|
||||
)
|
||||
preemptions_after = next(
|
||||
(m.value for m in metrics_after if m.name == "vllm:num_preemptions"), 0
|
||||
)
|
||||
preemptions = preemptions_after - preemptions_before
|
||||
assert preemptions > 0, "Test did not trigger any preemptions"
|
||||
|
||||
print(f"Test passed with {preemptions} preemptions")
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import math
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.v1.attention.utils import (
|
||||
@ -11,9 +12,16 @@ from tests.v1.attention.utils import (
|
||||
try_get_attention_backend,
|
||||
)
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
|
||||
from vllm.config import ParallelConfig, SpeculativeConfig
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
|
||||
if not is_flash_attn_varlen_func_available():
|
||||
pytest.skip(
|
||||
"This test requires flash_attn_varlen_func, but it's not available.",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
class MockAttentionLayer(torch.nn.Module):
|
||||
_q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
|
||||
|
||||
@ -185,7 +185,9 @@ def _make_mock_backend_for_kernel_block_size(
|
||||
supported_sizes: list[int | MultipleOf],
|
||||
):
|
||||
class _MockBackend:
|
||||
supported_kernel_block_sizes = supported_sizes
|
||||
@staticmethod
|
||||
def get_supported_kernel_block_sizes():
|
||||
return supported_sizes
|
||||
|
||||
return _MockBackend()
|
||||
|
||||
|
||||
@ -1,94 +1,79 @@
|
||||
#!/usr/bin/env bash
|
||||
set -ex
|
||||
|
||||
# prepare workspace directory
|
||||
WORKSPACE=$1
|
||||
if [ -z "$WORKSPACE" ]; then
|
||||
export WORKSPACE=$(pwd)/ep_kernels_workspace
|
||||
fi
|
||||
# usage: ./build.sh [workspace_dir] [mode]
|
||||
# mode: "install" (default) → install directly into current Python env
|
||||
# "wheel" → build wheels into WORKSPACE/dist
|
||||
|
||||
if [ ! -d "$WORKSPACE" ]; then
|
||||
mkdir -p $WORKSPACE
|
||||
fi
|
||||
WORKSPACE=${1:-$(pwd)/ep_kernels_workspace}
|
||||
MODE=${2:-install}
|
||||
mkdir -p "$WORKSPACE"
|
||||
|
||||
WHEEL_DIR="$WORKSPACE/dist"
|
||||
mkdir -p "$WHEEL_DIR"
|
||||
NVSHMEM_VER=3.3.9
|
||||
|
||||
pushd "$WORKSPACE"
|
||||
|
||||
# configurable pip command (default: pip3)
|
||||
PIP_CMD=${PIP_CMD:-pip3}
|
||||
CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
|
||||
|
||||
# install dependencies if not installed
|
||||
$PIP_CMD install cmake torch ninja
|
||||
|
||||
# build nvshmem
|
||||
pushd $WORKSPACE
|
||||
mkdir -p nvshmem_src
|
||||
wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
|
||||
tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
|
||||
pushd nvshmem_src
|
||||
wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
|
||||
git init
|
||||
git apply -vvv nvshmem.patch
|
||||
|
||||
# assume CUDA_HOME is set correctly
|
||||
if [ -z "$CUDA_HOME" ]; then
|
||||
echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
|
||||
exit 1
|
||||
if [ -z "$VIRTUAL_ENV" ]; then
|
||||
uv pip install --system cmake torch ninja
|
||||
else
|
||||
uv pip install cmake torch ninja
|
||||
fi
|
||||
|
||||
# assume TORCH_CUDA_ARCH_LIST is set correctly
|
||||
if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
|
||||
echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
|
||||
# fetch nvshmem
|
||||
ARCH=$(uname -m)
|
||||
case "${ARCH,,}" in
|
||||
x86_64|amd64)
|
||||
NVSHMEM_SUBDIR="linux-x86_64"
|
||||
NVSHMEM_FILE="libnvshmem-linux-x86_64-${NVSHMEM_VER}_cuda12-archive.tar.xz"
|
||||
;;
|
||||
aarch64|arm64)
|
||||
NVSHMEM_SUBDIR="linux-sbsa"
|
||||
NVSHMEM_FILE="libnvshmem-linux-sbsa-${NVSHMEM_VER}_cuda12-archive.tar.xz"
|
||||
;;
|
||||
*)
|
||||
echo "Unsupported architecture: ${ARCH}" >&2
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
|
||||
# disable all features except IBGDA
|
||||
export NVSHMEM_IBGDA_SUPPORT=1
|
||||
|
||||
export NVSHMEM_SHMEM_SUPPORT=0
|
||||
export NVSHMEM_UCX_SUPPORT=0
|
||||
export NVSHMEM_USE_NCCL=0
|
||||
export NVSHMEM_PMIX_SUPPORT=0
|
||||
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
|
||||
export NVSHMEM_USE_GDRCOPY=0
|
||||
export NVSHMEM_IBRC_SUPPORT=0
|
||||
export NVSHMEM_BUILD_TESTS=0
|
||||
export NVSHMEM_BUILD_EXAMPLES=0
|
||||
export NVSHMEM_MPI_SUPPORT=0
|
||||
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
|
||||
export NVSHMEM_BUILD_TXZ_PACKAGE=0
|
||||
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
|
||||
|
||||
cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
|
||||
cmake --build $WORKSPACE/nvshmem_build/ --target install
|
||||
NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"
|
||||
|
||||
pushd "$WORKSPACE"
|
||||
echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
|
||||
curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
|
||||
tar -xf "${NVSHMEM_FILE}"
|
||||
mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
|
||||
rm -f "${NVSHMEM_FILE}"
|
||||
rm -rf nvshmem/lib/bin nvshmem/lib/share
|
||||
popd
|
||||
|
||||
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
|
||||
export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem/lib/cmake:$CMAKE_PREFIX_PATH
|
||||
|
||||
is_git_dirty() {
|
||||
local dir=$1
|
||||
pushd "$dir" > /dev/null
|
||||
|
||||
if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
|
||||
if [ -d ".git" ] && [ -n "$(git status --porcelain 3>/dev/null)" ]; then
|
||||
popd > /dev/null
|
||||
return 0 # dirty (true)
|
||||
return 0
|
||||
else
|
||||
popd > /dev/null
|
||||
return 1 # clean (false)
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to handle git repository cloning with dirty/incomplete checks
|
||||
clone_repo() {
|
||||
local repo_url=$1
|
||||
local dir_name=$2
|
||||
local key_file=$3
|
||||
local commit_hash=$4
|
||||
|
||||
if [ -d "$dir_name" ]; then
|
||||
# Check if directory has uncommitted changes (dirty)
|
||||
if is_git_dirty "$dir_name"; then
|
||||
echo "$dir_name directory is dirty, skipping clone"
|
||||
# Check if clone failed (directory exists but not a valid git repo or missing key files)
|
||||
elif [ ! -d "$dir_name/.git" ] || [ ! -f "$dir_name/$key_file" ]; then
|
||||
echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
|
||||
rm -rf "$dir_name"
|
||||
@ -99,7 +84,7 @@ clone_repo() {
|
||||
cd ..
|
||||
fi
|
||||
else
|
||||
echo "$dir_name directory exists and appears complete; manually update if needed"
|
||||
echo "$dir_name directory exists and appears complete"
|
||||
fi
|
||||
else
|
||||
git clone "$repo_url"
|
||||
@ -111,17 +96,55 @@ clone_repo() {
|
||||
fi
|
||||
}
|
||||
|
||||
# build and install pplx, require pytorch installed
|
||||
pushd $WORKSPACE
|
||||
clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
|
||||
cd pplx-kernels
|
||||
$PIP_CMD install --no-build-isolation -vvv -e .
|
||||
popd
|
||||
deepep_cuda13_patch() {
|
||||
cuda_version_major=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2)
|
||||
if [ ${cuda_version_major} -ge 13 ]; then
|
||||
sed -i "s|f'{nvshmem_dir}/include']|f'{nvshmem_dir}/include', '${CUDA_HOME}/include/cccl']|" "setup.py"
|
||||
fi
|
||||
}
|
||||
|
||||
# build and install deepep, require pytorch installed
|
||||
pushd $WORKSPACE
|
||||
clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "73b6ea4"
|
||||
cd DeepEP
|
||||
export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
|
||||
$PIP_CMD install --no-build-isolation -vvv -e .
|
||||
popd
|
||||
do_build() {
|
||||
local repo=$1
|
||||
local name=$2
|
||||
local key=$3
|
||||
local commit=$4
|
||||
local extra_env=$5
|
||||
|
||||
pushd "$WORKSPACE"
|
||||
clone_repo "$repo" "$name" "$key" "$commit"
|
||||
cd "$name"
|
||||
|
||||
if [ "$name" == "DeepEP" ]; then
|
||||
deepep_cuda13_patch
|
||||
fi
|
||||
|
||||
if [ "$MODE" = "install" ]; then
|
||||
echo "Installing $name into environment"
|
||||
eval "$extra_env" uv pip install --no-build-isolation -vvv .
|
||||
else
|
||||
echo "Building $name wheel into $WHEEL_DIR"
|
||||
eval "$extra_env" uv build --wheel --no-build-isolation -vvv --out-dir "$WHEEL_DIR" .
|
||||
fi
|
||||
popd
|
||||
}
|
||||
|
||||
# build pplx-kernels
|
||||
do_build \
|
||||
"https://github.com/ppl-ai/pplx-kernels" \
|
||||
"pplx-kernels" \
|
||||
"setup.py" \
|
||||
"12cecfd" \
|
||||
""
|
||||
|
||||
# build DeepEP
|
||||
do_build \
|
||||
"https://github.com/deepseek-ai/DeepEP" \
|
||||
"DeepEP" \
|
||||
"setup.py" \
|
||||
"73b6ea4" \
|
||||
"export NVSHMEM_DIR=$WORKSPACE/nvshmem; "
|
||||
|
||||
if [ "$MODE" = "wheel" ]; then
|
||||
echo "All wheels written to $WHEEL_DIR"
|
||||
ls -l "$WHEEL_DIR"
|
||||
fi
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
#!/bin/bash
|
||||
# Script to install DeepGEMM from source
|
||||
# This script can be used both in Docker builds and by users locally
|
||||
|
||||
# Script to build and/or install DeepGEMM from source
|
||||
# Default: build and install immediately
|
||||
# Optional: build wheels to a directory for later installation (useful in multi-stage builds)
|
||||
set -e
|
||||
|
||||
# Default values
|
||||
DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
|
||||
DEEPGEMM_GIT_REF="594953acce41793ae00a1233eb516044d604bcb6"
|
||||
WHEEL_DIR=""
|
||||
|
||||
# Parse command line arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
@ -27,11 +28,20 @@ while [[ $# -gt 0 ]]; do
|
||||
CUDA_VERSION="$2"
|
||||
shift 2
|
||||
;;
|
||||
--wheel-dir)
|
||||
if [[ -z "$2" || "$2" =~ ^- ]]; then
|
||||
echo "Error: --wheel-dir requires a directory path." >&2
|
||||
exit 1
|
||||
fi
|
||||
WHEEL_DIR="$2"
|
||||
shift 2
|
||||
;;
|
||||
-h|--help)
|
||||
echo "Usage: $0 [OPTIONS]"
|
||||
echo "Options:"
|
||||
echo " --ref REF Git reference to checkout (default: $DEEPGEMM_GIT_REF)"
|
||||
echo " --cuda-version VER CUDA version (auto-detected if not provided)"
|
||||
echo " --wheel-dir PATH If set, build wheel into PATH but do not install"
|
||||
echo " -h, --help Show this help message"
|
||||
exit 0
|
||||
;;
|
||||
@ -57,16 +67,15 @@ fi
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"
|
||||
CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
|
||||
CUDA_MINOR="${CUDA_MINOR%%.*}"
|
||||
|
||||
echo "CUDA version: $CUDA_VERSION (major: $CUDA_MAJOR, minor: $CUDA_MINOR)"
|
||||
|
||||
# Check CUDA version requirement
|
||||
if [ "$CUDA_MAJOR" -lt 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -lt 8 ]; }; then
|
||||
echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
|
||||
echo "Skipping DeepGEMM build/installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Installing DeepGEMM from source..."
|
||||
echo "Preparing DeepGEMM build..."
|
||||
echo "Repository: $DEEPGEMM_GIT_REPO"
|
||||
echo "Reference: $DEEPGEMM_GIT_REF"
|
||||
|
||||
@ -76,23 +85,31 @@ trap 'rm -rf "$INSTALL_DIR"' EXIT
|
||||
|
||||
# Clone the repository
|
||||
git clone --recursive --shallow-submodules "$DEEPGEMM_GIT_REPO" "$INSTALL_DIR/deepgemm"
|
||||
|
||||
echo "🏗️ Building DeepGEMM"
|
||||
pushd "$INSTALL_DIR/deepgemm"
|
||||
|
||||
# Checkout the specific reference
|
||||
git checkout "$DEEPGEMM_GIT_REF"
|
||||
|
||||
# Build DeepGEMM
|
||||
# Clean previous build artifacts
|
||||
# (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
|
||||
rm -rf build dist
|
||||
rm -rf *.egg-info
|
||||
rm -rf build dist *.egg-info
|
||||
|
||||
# Build wheel
|
||||
echo "🏗️ Building DeepGEMM wheel..."
|
||||
python3 setup.py bdist_wheel
|
||||
|
||||
# Install the wheel
|
||||
# If --wheel-dir was specified, copy wheels there and exit
|
||||
if [ -n "$WHEEL_DIR" ]; then
|
||||
mkdir -p "$WHEEL_DIR"
|
||||
cp dist/*.whl "$WHEEL_DIR"/
|
||||
echo "✅ Wheel built and copied to $WHEEL_DIR"
|
||||
popd
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Default behaviour: install built wheel
|
||||
if command -v uv >/dev/null 2>&1; then
|
||||
echo "Installing DeepGEMM wheel using uv..."
|
||||
# Use --system in Docker contexts, respect user's environment otherwise
|
||||
if [ -n "$VLLM_DOCKER_BUILD_CONTEXT" ]; then
|
||||
uv pip install --system dist/*.whl
|
||||
else
|
||||
@ -104,5 +121,4 @@ else
|
||||
fi
|
||||
|
||||
popd
|
||||
|
||||
echo "✅ DeepGEMM installation completed successfully"
|
||||
|
||||
@ -948,6 +948,31 @@ class rocm_aiter_ops:
|
||||
(8192, 32768),
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def is_triton_gemm_afp4wfp4_presh_ws_tuned(n: int, k: int) -> bool:
|
||||
return (n, k) in [
|
||||
(8192, 4096),
|
||||
(1280, 8192),
|
||||
(16384, 53248),
|
||||
(106496, 16384),
|
||||
(57344, 8192),
|
||||
(8192, 2048),
|
||||
(2560, 8192),
|
||||
(10240, 8192),
|
||||
(16384, 16384),
|
||||
(8192, 28672),
|
||||
(28672, 8192),
|
||||
(18432, 16384),
|
||||
(8192, 1024),
|
||||
(7168, 8192),
|
||||
(5120, 8192),
|
||||
(8192, 8192),
|
||||
(8192, 7168),
|
||||
(14336, 8192),
|
||||
(8192, 14336),
|
||||
(8192, 3584),
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def shuffle_weight(
|
||||
self, tensor: torch.Tensor, layout: tuple[int, int] = (16, 16)
|
||||
|
||||
@ -46,9 +46,12 @@ class AttentionBackend(ABC):
|
||||
# makes sure the output tensor is allocated inside the cudagraph.
|
||||
accept_output_buffer: bool = False
|
||||
supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
|
||||
supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(1)]
|
||||
supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = ["auto"]
|
||||
|
||||
@staticmethod
|
||||
def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
|
||||
return [MultipleOf(1)]
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_name() -> str:
|
||||
@ -142,10 +145,11 @@ class AttentionBackend(ABC):
|
||||
if block_size not in valid_sizes:
|
||||
return False
|
||||
|
||||
if not cls.supported_kernel_block_sizes:
|
||||
supported_kernel_block_sizes = cls.get_supported_kernel_block_sizes()
|
||||
if not supported_kernel_block_sizes:
|
||||
return True
|
||||
|
||||
for supported_size in cls.supported_kernel_block_sizes:
|
||||
for supported_size in supported_kernel_block_sizes:
|
||||
if isinstance(supported_size, MultipleOf):
|
||||
supported_size = supported_size.base
|
||||
# With hybrid_blocks feature, the framework-level block size
|
||||
|
||||
@ -116,7 +116,8 @@ class VllmSerializableFunction(SerializableCallable):
|
||||
the AOT compiled path.
|
||||
"""
|
||||
compile_inputs = [
|
||||
inp or example_inputs[i] for i, inp in enumerate(fn.example_inputs)
|
||||
inp if inp is not None else example_inputs[i]
|
||||
for i, inp in enumerate(fn.example_inputs)
|
||||
]
|
||||
with tracing(TracingContext(fake_mode)):
|
||||
fn.optimized_call = vllm_backend(
|
||||
|
||||
@ -73,8 +73,8 @@ class CacheConfig:
|
||||
sliding_window: int | None = None
|
||||
"""Sliding window size for the KV cache. This is primarily set in
|
||||
`ModelConfig` and that value should be manually duplicated here."""
|
||||
enable_prefix_caching: bool | None = None
|
||||
"""Whether to enable prefix caching. Enabled by default for V1."""
|
||||
enable_prefix_caching: bool = True
|
||||
"""Whether to enable prefix caching."""
|
||||
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
|
||||
"""Set the hash algorithm for prefix caching:\n
|
||||
- "sha256" uses Pickle for object serialization before hashing.\n
|
||||
|
||||
@ -950,14 +950,18 @@ class CompilationConfig:
|
||||
)
|
||||
)
|
||||
|
||||
if len(rounded_sizes) == 0 and multiple_of <= self.max_cudagraph_capture_size:
|
||||
# if one valid but would be round_down use that
|
||||
rounded_sizes = [multiple_of]
|
||||
|
||||
if len(rounded_sizes) == 0:
|
||||
logger.warning(
|
||||
"No valid cudagraph sizes after rounding to multiple of "
|
||||
" num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens"
|
||||
" or max_cudagraph_capture_size (or cudagraph_capture_sizes)",
|
||||
multiple_of,
|
||||
raise ValueError(
|
||||
f"No valid cudagraph sizes after rounding to multiple of {multiple_of} "
|
||||
f"(num_speculative_tokens + 1 or tp if sequence parallelism is enabled)"
|
||||
f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1}"
|
||||
f") or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})"
|
||||
f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})"
|
||||
)
|
||||
return
|
||||
|
||||
self.max_cudagraph_capture_size = rounded_sizes[-1]
|
||||
self.cudagraph_capture_sizes = rounded_sizes
|
||||
|
||||
@ -81,7 +81,7 @@ TaskOption = Literal[
|
||||
"transcription",
|
||||
"draft",
|
||||
]
|
||||
TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
|
||||
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "custom"]
|
||||
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||
LogprobsMode = Literal[
|
||||
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
|
||||
@ -130,7 +130,8 @@ class ModelConfig:
|
||||
name or path will be used."""
|
||||
tokenizer_mode: TokenizerMode = "auto"
|
||||
"""Tokenizer mode:\n
|
||||
- "auto" will use the fast tokenizer if available.\n
|
||||
- "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n
|
||||
- "hf" will use the fast tokenizer if available.\n
|
||||
- "slow" will always use the slow tokenizer.\n
|
||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||
- "custom" will use --tokenizer to select the preregistered tokenizer."""
|
||||
@ -146,9 +147,12 @@ class ModelConfig:
|
||||
- "bfloat16" for a balance between precision and range.\n
|
||||
- "float" is shorthand for FP32 precision.\n
|
||||
- "float32" for FP32 precision."""
|
||||
seed: int | None = None
|
||||
"""Random seed for reproducibility. Initialized to None in V0, but
|
||||
initialized to 0 in V1."""
|
||||
seed: int = 0
|
||||
"""Random seed for reproducibility.
|
||||
|
||||
We must set the global seed because otherwise,
|
||||
different tensor parallel workers would sample different tokens,
|
||||
leading to inconsistent results."""
|
||||
hf_config: PretrainedConfig = field(init=False)
|
||||
"""The Hugging Face config of the model."""
|
||||
hf_text_config: PretrainedConfig = field(init=False)
|
||||
@ -238,8 +242,8 @@ class ModelConfig:
|
||||
first one."""
|
||||
config_format: str | ConfigFormat = "auto"
|
||||
"""The format of the model config to load:\n
|
||||
- "auto" will try to load the config in hf format if available else it
|
||||
will try to load in mistral format.\n
|
||||
- "auto" will try to load the config in hf format if available after trying
|
||||
to load in mistral format.\n
|
||||
- "hf" will load the config in hf format.\n
|
||||
- "mistral" will load the config in mistral format."""
|
||||
hf_token: bool | str | None = None
|
||||
@ -415,7 +419,7 @@ class ModelConfig:
|
||||
def __post_init__(
|
||||
self,
|
||||
# Multimodal config init vars
|
||||
limit_mm_per_prompt: dict[str, int] | None,
|
||||
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
|
||||
enable_mm_embeds: bool | None,
|
||||
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
||||
mm_processor_kwargs: dict[str, Any] | None,
|
||||
@ -428,23 +432,6 @@ class ModelConfig:
|
||||
skip_mm_profiling: bool | None,
|
||||
video_pruning_rate: float | None,
|
||||
) -> None:
|
||||
# Set the default seed to 0 in V1.
|
||||
# NOTE(woosuk): In V1, we use separate processes for workers (unless
|
||||
# VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
|
||||
# doesn't affect the user process. However, without a consistent seed,
|
||||
# different tensor parallel workers would sample different tokens,
|
||||
# leading to inconsistent results.
|
||||
if self.seed is None:
|
||||
self.seed = 0
|
||||
if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
|
||||
logger.warning(
|
||||
"The global random seed is set to %d. Since "
|
||||
"VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
|
||||
"affect the random state of the Python process that "
|
||||
"launched vLLM.",
|
||||
self.seed,
|
||||
)
|
||||
|
||||
# Keep set served_model_name before maybe_model_redirect(self.model)
|
||||
self.served_model_name = get_served_model_name(
|
||||
self.model, self.served_model_name
|
||||
@ -1151,12 +1138,6 @@ class ModelConfig:
|
||||
self,
|
||||
parallel_config: ParallelConfig,
|
||||
) -> None:
|
||||
if parallel_config.distributed_executor_backend == "external_launcher":
|
||||
assert self.seed is not None, (
|
||||
"Seed must be set when using external launcher backend to "
|
||||
"make sure sampling results are the same across workers."
|
||||
)
|
||||
|
||||
total_num_attention_heads = getattr(
|
||||
self.hf_text_config, "num_attention_heads", 0
|
||||
)
|
||||
|
||||
@ -9,6 +9,7 @@ from pydantic import Field, SkipValidation, model_validator
|
||||
from pydantic.dataclasses import dataclass
|
||||
from typing_extensions import Self
|
||||
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.config.utils import config
|
||||
from vllm.logger import init_logger
|
||||
@ -18,10 +19,8 @@ if TYPE_CHECKING:
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
import vllm.model_executor.layers.quantization as me_quant
|
||||
from vllm.config import ModelConfig
|
||||
else:
|
||||
PretrainedConfig = Any
|
||||
ModelConfig = Any
|
||||
|
||||
me_quant = LazyLoader(
|
||||
"model_executor", globals(), "vllm.model_executor.layers.quantization"
|
||||
@ -316,10 +315,6 @@ class SpeculativeConfig:
|
||||
self.prompt_lookup_min = 0
|
||||
|
||||
if self.model is not None:
|
||||
# TODO: Move this import to the top once `ModelConfig`
|
||||
# lives in `vllm.config.model`.
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
self.draft_model_config = ModelConfig(
|
||||
model=self.model,
|
||||
runner="draft",
|
||||
|
||||
@ -310,7 +310,6 @@ class LMCacheMPWorkerAdapter:
|
||||
request_id,
|
||||
result,
|
||||
)
|
||||
logger.info("Retrieve request for request_id=%s finished", request_id)
|
||||
|
||||
# Remove the finished requests from the tracking dicts
|
||||
for request_id in finished_stores:
|
||||
|
||||
@ -469,9 +469,6 @@ class LMCacheMPConnector(KVConnectorBase_V1):
|
||||
ops.append(meta.op)
|
||||
|
||||
if len(request_ids) > 0:
|
||||
logger.info(
|
||||
"HERE! SUBMITTING THE BATCHED RETRIEVE REQUESTS %s", request_ids
|
||||
)
|
||||
self.worker_adapter.batched_submit_retrieve_requests(
|
||||
request_ids, ops, event
|
||||
)
|
||||
|
||||
@ -1042,10 +1042,12 @@ class NixlConnectorWorker:
|
||||
NOT directly supported by NIXL (e.g., tpu)
|
||||
"""
|
||||
xfer_buffers: dict[str, torch.Tensor] = {}
|
||||
inv_order = [0, 1, 3, 2, 4]
|
||||
try:
|
||||
for layer_name, kv_cache in kv_caches.items():
|
||||
kv_shape = kv_cache.shape
|
||||
kv_dtype = kv_cache.dtype
|
||||
permute_shape = False
|
||||
if (
|
||||
self.kv_cache_layout == "NHD"
|
||||
and self.vllm_config.kv_transfer_config is not None
|
||||
@ -1059,10 +1061,20 @@ class NixlConnectorWorker:
|
||||
# Since NHD will not support Decode/Prefill TP_ratio > 1,
|
||||
# we can leverage host_buffer for permute
|
||||
self.host_buffer_kv_cache_layout = "HND"
|
||||
kv_shape = tuple(kv_shape[i] for i in [0, 1, 3, 2, 4])
|
||||
kv_shape = (
|
||||
tuple(kv_shape[i] for i in inv_order)
|
||||
if not self.use_mla
|
||||
else kv_shape
|
||||
)
|
||||
permute_shape = not self.use_mla
|
||||
|
||||
xfer_buffers[layer_name] = torch.empty(
|
||||
kv_shape, dtype=kv_dtype, device="cpu"
|
||||
)
|
||||
if permute_shape:
|
||||
xfer_buffers[layer_name] = xfer_buffers[layer_name].permute(
|
||||
inv_order
|
||||
)
|
||||
except MemoryError as e:
|
||||
logger.error("NIXLConnectorWorker gets %s.", e)
|
||||
raise
|
||||
|
||||
@ -367,7 +367,7 @@ class EngineArgs:
|
||||
config_format: str = ModelConfig.config_format
|
||||
dtype: ModelDType = ModelConfig.dtype
|
||||
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
|
||||
seed: int | None = ModelConfig.seed
|
||||
seed: int | None = 0
|
||||
max_model_len: int | None = ModelConfig.max_model_len
|
||||
cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
|
||||
cudagraph_capture_sizes: list[int] | None = (
|
||||
@ -425,7 +425,7 @@ class EngineArgs:
|
||||
ParallelConfig.max_parallel_loading_workers
|
||||
)
|
||||
block_size: BlockSize | None = CacheConfig.block_size
|
||||
enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching
|
||||
enable_prefix_caching: bool | None = None
|
||||
prefix_caching_hash_algo: PrefixCachingHashAlgo = (
|
||||
CacheConfig.prefix_caching_hash_algo
|
||||
)
|
||||
@ -1188,29 +1188,52 @@ class EngineArgs:
|
||||
if check_gguf_file(self.model):
|
||||
self.quantization = self.load_format = "gguf"
|
||||
|
||||
# NOTE(woosuk): In V1, we use separate processes for workers (unless
|
||||
# VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
|
||||
# doesn't affect the user process.
|
||||
if self.seed is None:
|
||||
logger.warning_once(
|
||||
"`seed=None` is equivalent to `seed=0` in V1 Engine. "
|
||||
"You will no longer be allowed to pass `None` in v0.13.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
self.seed = 0
|
||||
if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
|
||||
logger.warning(
|
||||
"The global random seed is set to %d. Since "
|
||||
"VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
|
||||
"affect the random state of the Python process that "
|
||||
"launched vLLM.",
|
||||
self.seed,
|
||||
)
|
||||
|
||||
if self.disable_mm_preprocessor_cache:
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"`--disable-mm-preprocessor-cache` is deprecated "
|
||||
"and will be removed in v0.13. "
|
||||
"Please use `--mm-processor-cache-gb 0` instead.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
self.mm_processor_cache_gb = 0
|
||||
elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"VLLM_MM_INPUT_CACHE_GIB` is deprecated "
|
||||
"and will be removed in v0.13. "
|
||||
"Please use `--mm-processor-cache-gb %d` instead.",
|
||||
envs.VLLM_MM_INPUT_CACHE_GIB,
|
||||
scope="local",
|
||||
)
|
||||
|
||||
self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
|
||||
|
||||
if self.enable_multimodal_encoder_data_parallel:
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"--enable-multimodal-encoder-data-parallel` is deprecated "
|
||||
"and will be removed in v0.13. "
|
||||
"Please use `--mm-encoder-tp-mode data` instead."
|
||||
"Please use `--mm-encoder-tp-mode data` instead.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
self.mm_encoder_tp_mode = "data"
|
||||
@ -1369,11 +1392,10 @@ class EngineArgs:
|
||||
# Set default arguments for V1 Engine.
|
||||
self._set_default_args(usage_context, model_config)
|
||||
# Disable chunked prefill and prefix caching for:
|
||||
# POWER (ppc64le)/ARM/s390x/RISCV CPUs in V1
|
||||
# POWER (ppc64le)/s390x/RISCV CPUs in V1
|
||||
if current_platform.is_cpu() and current_platform.get_cpu_architecture() in (
|
||||
CpuArchEnum.POWERPC,
|
||||
CpuArchEnum.S390X,
|
||||
CpuArchEnum.ARM,
|
||||
CpuArchEnum.RISCV,
|
||||
):
|
||||
logger.info(
|
||||
@ -1952,10 +1974,11 @@ class EngineArgs:
|
||||
if self.prefill_context_parallel_size > 1:
|
||||
default_chunked_prefill = False
|
||||
default_prefix_caching = False
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"--prefill-context-parallel-size > 1 is not compatible with "
|
||||
"chunked prefill and prefix caching now. Chunked prefill "
|
||||
"and prefix caching have been disabled by default."
|
||||
"and prefix caching have been disabled by default.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
if self.enable_chunked_prefill is None:
|
||||
@ -1965,15 +1988,27 @@ class EngineArgs:
|
||||
"%s chunked prefill by default",
|
||||
"Enabling" if default_chunked_prefill else "Disabling",
|
||||
)
|
||||
elif (
|
||||
model_config.runner_type == "generate"
|
||||
and not self.enable_chunked_prefill
|
||||
and default_chunked_prefill
|
||||
):
|
||||
logger.warning_once(
|
||||
"This model does not officially support disabling chunked prefill. "
|
||||
"Disabling this manually may cause the engine to crash "
|
||||
"or produce incorrect outputs.",
|
||||
scope="local",
|
||||
)
|
||||
elif (
|
||||
model_config.runner_type == "pooling"
|
||||
and self.enable_chunked_prefill
|
||||
and not default_chunked_prefill
|
||||
):
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"This model does not officially support chunked prefill. "
|
||||
"Enabling this manually may cause the engine to crash "
|
||||
"or produce incorrect outputs.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
if self.enable_prefix_caching is None:
|
||||
@ -1988,10 +2023,11 @@ class EngineArgs:
|
||||
and self.enable_prefix_caching
|
||||
and not default_prefix_caching
|
||||
):
|
||||
logger.warning(
|
||||
logger.warning_once(
|
||||
"This model does not officially support prefix caching. "
|
||||
"Enabling this manually may cause the engine to crash "
|
||||
"or produce incorrect outputs.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
world_size = self.pipeline_parallel_size * self.tensor_parallel_size
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user