mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-17 04:47:03 +08:00
Merge branch 'main' into woosuk/input-prep
This commit is contained in:
commit
d6d719fb24
@ -49,23 +49,23 @@ function cpu_tests() {
|
||||
# Run kernel tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -v -s tests/kernels/test_onednn.py"
|
||||
pytest -x -v -s tests/kernels/test_onednn.py"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
# Note: disable until supports V1
|
||||
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
|
||||
# Note: disable Bart until supports V1
|
||||
pytest -v -s tests/models/language/generation -m cpu_model \
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||
--ignore=tests/models/language/generation/test_bart.py
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
|
||||
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
|
||||
--ignore=tests/models/language/generation/test_bart.py
|
||||
|
||||
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -v -s tests/models/multimodal/generation \
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -x -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
@ -73,20 +73,20 @@ function cpu_tests() {
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
pytest -x -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||
|
||||
# Note: disable it until supports V1
|
||||
# Run AWQ test
|
||||
# docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
# set -e
|
||||
# VLLM_USE_V1=0 pytest -s -v \
|
||||
# VLLM_USE_V1=0 pytest -x -s -v \
|
||||
# tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
pytest -x -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
|
||||
# online serving
|
||||
|
||||
@ -234,7 +234,26 @@ steps:
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: V1 Test
|
||||
- label: V1 Test e2e + engine
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: V1 Test entrypoints
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
|
||||
- label: V1 Test others
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -242,8 +261,6 @@ steps:
|
||||
commands:
|
||||
# split the test to avoid interference
|
||||
- pytest -v -s v1/core
|
||||
- pytest -v -s v1/engine
|
||||
- pytest -v -s v1/entrypoints
|
||||
- pytest -v -s v1/executor
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
@ -256,9 +273,6 @@ steps:
|
||||
- pytest -v -s v1/test_utils.py
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
- pytest -v -s v1/test_metrics_reader.py
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
# Integration test for streaming correctness (requires special branch).
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
@ -798,6 +812,7 @@ steps:
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/weight_loading
|
||||
|
||||
@ -419,8 +419,10 @@ class BenchmarkWorker:
|
||||
)
|
||||
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||
# is the intermediate size after silu_and_mul.
|
||||
block_n = block_quant_shape[0] if block_quant_shape else None
|
||||
block_k = block_quant_shape[1] if block_quant_shape else None
|
||||
op_config = get_moe_configs(
|
||||
num_experts, shard_intermediate_size // 2, dtype_str
|
||||
num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
|
||||
)
|
||||
if op_config is None:
|
||||
config = get_default_config(
|
||||
@ -430,6 +432,7 @@ class BenchmarkWorker:
|
||||
hidden_size,
|
||||
topk,
|
||||
dtype_str,
|
||||
block_quant_shape,
|
||||
)
|
||||
else:
|
||||
config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
|
||||
|
||||
17
csrc/cache.h
17
csrc/cache.h
@ -36,6 +36,13 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& scale);
|
||||
|
||||
// Variant of concat_and_cache_mla that takes an extra
// cp_local_token_select_indices tensor; its CUDA kernel uses those indices to
// pick which rows of kv_c / k_pe are written into kv_cache at the slots given
// by slot_mapping.
void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
|
||||
torch::Tensor& cp_local_token_select_indices,
|
||||
torch::Tensor& kv_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& scale);
|
||||
|
||||
// Just for unittest
|
||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
@ -47,4 +54,12 @@ void gather_and_maybe_dequant_cache(
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, const std::string& kv_cache_dtype,
|
||||
torch::Tensor const& scale,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
// Gathers cache entries from src_cache into dst using block_table and
// cu_seq_lens. Unlike gather_and_maybe_dequant_cache it takes no
// kv_cache_dtype / scale arguments.
// TODO(hc): cp_gather_cache needs to support a scaled kv-cache in the future.
|
||||
void cp_gather_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "cuda_compat.h"
|
||||
@ -395,6 +396,51 @@ __global__ void concat_and_cache_mla_kernel(
|
||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||
}
|
||||
|
||||
// One thread block per slot_mapping entry. Block `blockIdx.x` reads the source
// row selected by cp_local_token_select_indices[blockIdx.x] from kv_c and k_pe
// and stores the two pieces back to back (kv_c first, then k_pe) in the
// kv_cache entry addressed by slot_mapping[blockIdx.x]. Entries whose slot is
// negative (padding) are skipped. When kv_dt is not kAuto every element goes
// through fp8::scaled_convert with *scale.
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void cp_fused_concat_and_cache_mla_kernel(
    const scalar_t* __restrict__ kv_c,  // [num_full_tokens, kv_lora_rank]
    const scalar_t* __restrict__ k_pe,  // [num_full_tokens, pe_dim]
    const int64_t* __restrict__ cp_local_token_select_indices,  // [num_tokens]
    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank
                                     // + pe_dim)]
    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
    const int block_stride,                    //
    const int entry_stride,                    //
    const int kv_c_stride,                     //
    const int k_pe_stride,                     //
    const int kv_lora_rank,                    //
    const int pe_dim,                          //
    const int block_size,                      //
    const float* scale                         //
) {
  const int64_t src_row = cp_local_token_select_indices[blockIdx.x];
  const int64_t slot = slot_mapping[blockIdx.x];
  // NOTE: the slot can be -1 if the token is padded.
  if (slot < 0) {
    return;
  }

  // Locate the destination entry inside the paged cache.
  const int64_t cache_block = slot / block_size;
  const int64_t slot_in_block = slot % block_size;
  const int64_t entry_base =
      cache_block * block_stride + slot_in_block * entry_stride;

  // Copies `count` elements of row `src_row` of `src` into the destination
  // entry, starting at column `dst_col`; threads of the block stride over the
  // elements.
  auto store_segment = [&](const scalar_t* __restrict__ src, int src_stride,
                           int count, int dst_col) {
    const int64_t src_base = src_row * src_stride;
    for (int e = threadIdx.x; e < count; e += blockDim.x) {
      const int64_t dst_pos = entry_base + e + dst_col;
      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
        kv_cache[dst_pos] = src[src_base + e];
      } else {
        kv_cache[dst_pos] = fp8::scaled_convert<cache_t, scalar_t, kv_dt>(
            src[src_base + e], *scale);
      }
    }
  };

  store_segment(kv_c, kv_c_stride, kv_lora_rank, 0);
  store_segment(k_pe, k_pe_stride, pe_dim, kv_lora_rank);
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// KV_T is the data type of key and value tensors.
|
||||
@ -508,6 +554,20 @@ void reshape_and_cache_flash(
|
||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||
|
||||
// Launches vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE>
// on `stream` with the `grid` / `block` configuration. The tensors, strides
// and sizes named in the argument list must be in scope at the expansion site.
// KV_T is the data type of key and value tensors.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
|
||||
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
|
||||
cp_local_token_select_indices.data_ptr<int64_t>(), \
|
||||
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
|
||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||
|
||||
void concat_and_cache_mla(
|
||||
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
|
||||
torch::Tensor& k_pe, // [num_tokens, pe_dim]
|
||||
@ -546,6 +606,50 @@ void concat_and_cache_mla(
|
||||
CALL_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
|
||||
// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
|
||||
// calls into one:
|
||||
// k_c_normed.index_select(0, cp_local_token_select_indices) + \
|
||||
// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
|
||||
// concat_and_cache_mla.
|
||||
void cp_fused_concat_and_cache_mla(
|
||||
torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank]
|
||||
torch::Tensor& k_pe, // [num_total_tokens, pe_dim]
|
||||
torch::Tensor& cp_local_token_select_indices, // [num_tokens]
|
||||
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
|
||||
// pe_dim)]
|
||||
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& scale) {
|
||||
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
|
||||
// slot_mapping.size(0) because of padding for CUDA graphs.
|
||||
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
|
||||
// both include padding.
|
||||
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
|
||||
// since key includes padding for CUDA graphs, while slot_mapping does not.
|
||||
// In this case, slot_mapping.size(0) represents the actual number of tokens
|
||||
// before padding.
|
||||
// For compatibility with both cases, we use slot_mapping.size(0) as the
|
||||
// number of tokens.
|
||||
int num_tokens = slot_mapping.size(0);
|
||||
int kv_lora_rank = kv_c.size(1);
|
||||
int pe_dim = k_pe.size(1);
|
||||
int block_size = kv_cache.size(1);
|
||||
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
|
||||
int kv_c_stride = kv_c.stride(0);
|
||||
int k_pe_stride = k_pe.stride(0);
|
||||
int block_stride = kv_cache.stride(0);
|
||||
int entry_stride = kv_cache.stride(1);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(kv_lora_rank, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
|
||||
@ -779,3 +883,146 @@ void gather_and_maybe_dequant_cache(
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
|
||||
}
|
||||
|
||||
namespace vllm {

// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
// block_size.
//
// Launch layout: blockIdx.x selects the batch entry, blockIdx.y selects one of
// gridDim.y splits of that entry's tokens. Each split copies its tokens one
// entry at a time from the paged src_cache (located through block_table) into
// the rows of dst that start at cu_seq_lens[bid]. Threads of a block stride
// over the entry_size elements of an entry.
template <typename scalar_t>
__global__ void cp_gather_cache(
    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
                                              // ENTRY_SIZE]
    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRY_SIZE]
    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
    const int32_t block_size, const int32_t entry_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
    const int32_t* __restrict__ seq_starts  // Optional: starting offsets per
                                            // batch
) {
  const int64_t bid = blockIdx.x;  // Batch ID
  const int32_t num_splits = gridDim.y;
  const int32_t split = blockIdx.y;
  const int32_t seq_start = cu_seq_lens[bid];
  const int32_t seq_end = cu_seq_lens[bid + 1];
  const int32_t tot_slots = seq_end - seq_start;
  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);

  // Token range [split_start, split_end) handled by this split.
  const int32_t split_start = split * split_slots;
  const int32_t split_end = min((split + 1) * split_slots, tot_slots);

  // Splits that start past the end of the sequence have nothing to copy.
  if (split_start >= tot_slots) return;

  // Adjust the pointer for the block_table for this batch. The product is kept
  // in 64 bits because block_table_stride is a 64-bit stride.
  const int64_t batch_offset = bid * block_table_stride;
  const int32_t* batch_block_table = block_table + batch_offset;

  // Position of the first token of this split inside the sequence's cache:
  // `offset_div` is the index into the block table, `offset` the slot within
  // that block. If seq_starts is provided it shifts the starting position.
  int32_t offset = split_start;
  if (seq_starts != nullptr) {
    offset += seq_starts[bid];
  }
  int32_t offset_div = offset / block_size;
  offset = offset % block_size;

  // Adjust dst pointer based on the cumulative sequence lengths.
  dst += seq_start * dst_entry_stride;

  auto copy_entry = [&](const scalar_t* __restrict__ _src,
                        scalar_t* __restrict__ _dst) {
    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
      _dst[i] = _src[i];
  };

  for (int pid = split_start; pid < split_end; ++pid) {
    auto block_id = batch_block_table[offset_div];
    auto block_start_ptr = src_cache + block_id * cache_block_stride;
    auto block_dst_ptr = dst + pid * dst_entry_stride;
    copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
    offset += 1;
    // bump to next block
    if (offset == block_size) {
      offset_div += 1;
      offset = 0;
    }
  }
}
}  // namespace vllm
|
||||
|
||||
// Macro to dispatch the kernel based on the data type.
// CPY_DTYPE is the element type the src_cache / dst buffers are reinterpreted
// as for the copy. `grid`, `block`, `stream` and the other names in the
// argument list must be in scope at the expansion site.
|
||||
#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \
|
||||
vllm::cp_gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()), \
|
||||
reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()), \
|
||||
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
|
||||
block_size, entry_size, block_table_stride, cache_block_stride, \
|
||||
cache_entry_stride, dst_entry_stride, seq_starts_ptr);
|
||||
|
||||
// Gather sequences from the cache into the destination tensor.
|
||||
// - cu_seq_lens contains the cumulative sequence lengths for each batch
|
||||
// - block_table contains the cache block indices for each sequence
|
||||
// - Optionally, seq_starts (if provided) offsets the starting slot index by
|
||||
// seq_starts[bid]
|
||||
void cp_gather_cache(
|
||||
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
|
||||
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
|
||||
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
|
||||
torch::Tensor const& cu_seq_lens, // [BATCH+1]
|
||||
int64_t batch_size,
|
||||
std::optional<torch::Tensor> seq_starts = std::nullopt) {
|
||||
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int32_t block_size = src_cache.size(1);
|
||||
int32_t entry_size = src_cache.flatten(2, -1).size(2);
|
||||
|
||||
TORCH_CHECK(block_table.dtype() == torch::kInt32,
|
||||
"block_table must be int32");
|
||||
TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
|
||||
"cu_seq_lens must be int32");
|
||||
if (seq_starts.has_value()) {
|
||||
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
|
||||
"seq_starts must be int32");
|
||||
}
|
||||
|
||||
TORCH_CHECK(src_cache.device() == dst.device(),
|
||||
"src_cache and dst must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == block_table.device(),
|
||||
"src_cache and block_table must be on the same device");
|
||||
TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
|
||||
"src_cache and cu_seq_lens must be on the same device");
|
||||
if (seq_starts.has_value()) {
|
||||
TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
|
||||
"src_cache and seq_starts must be on the same device");
|
||||
}
|
||||
|
||||
int64_t block_table_stride = block_table.stride(0);
|
||||
int64_t cache_block_stride = src_cache.stride(0);
|
||||
int64_t cache_entry_stride = src_cache.stride(1);
|
||||
int64_t dst_entry_stride = dst.stride(0);
|
||||
|
||||
// Decide on the number of splits based on the batch size.
|
||||
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
|
||||
dim3 grid(batch_size, num_splits);
|
||||
dim3 block(1024);
|
||||
|
||||
TORCH_CHECK(src_cache.dtype() == dst.dtype(),
|
||||
"src_cache and dst must have the same dtype");
|
||||
|
||||
const int dtype_bits = src_cache.element_size() * 8;
|
||||
const int32_t* seq_starts_ptr =
|
||||
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
|
||||
|
||||
if (dtype_bits == 32) {
|
||||
CALL_CP_GATHER_CACHE(uint32_t);
|
||||
} else if (dtype_bits == 16) {
|
||||
CALL_CP_GATHER_CACHE(uint16_t);
|
||||
} else if (dtype_bits == 8) {
|
||||
CALL_CP_GATHER_CACHE(uint8_t);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
|
||||
}
|
||||
}
|
||||
|
||||
@ -686,6 +686,16 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
" Tensor scale) -> ()");
|
||||
cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
|
||||
|
||||
cache_ops.def(
|
||||
"cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
|
||||
" Tensor cp_local_token_select_indices,"
|
||||
" Tensor! kv_cache,"
|
||||
" Tensor slot_mapping,"
|
||||
" str kv_cache_dtype,"
|
||||
" Tensor scale) -> ()");
|
||||
cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
|
||||
&cp_fused_concat_and_cache_mla);
|
||||
|
||||
// Convert the key and value cache to fp8 data type.
|
||||
cache_ops.def(
|
||||
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
|
||||
@ -702,6 +712,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
" Tensor scale, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,
|
||||
&gather_and_maybe_dequant_cache);
|
||||
|
||||
cache_ops.def(
|
||||
"cp_gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
|
||||
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
|
||||
cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
||||
|
||||
@ -90,7 +90,7 @@ address the long build time at its source, the current workaround is to set `VLL
|
||||
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
|
||||
when manually triggering a build on Buildkite. This branch accomplishes two things:
|
||||
|
||||
1. Increase the timeout limit to 10 hours so that the build doesn't timeout.
|
||||
1. Increase the timeout limit to 10 hours so that the build doesn't time out.
|
||||
2. Allow the compiled artifacts to be written to the vLLM sccache S3 bucket
|
||||
to warm it up so that future builds are faster.
|
||||
|
||||
|
||||
@ -855,7 +855,7 @@ Examples:
|
||||
|
||||
### Custom HF processor
|
||||
|
||||
Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
|
||||
Some models don't define an HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@ -6,6 +6,6 @@ Supports speech-synthesis, multi-modal, and extensible (function call) plugin sy
|
||||
|
||||
One-click FREE deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application.
|
||||
|
||||
It supports vLLM as a AI model provider to efficiently serve large language models.
|
||||
It supports vLLM as an AI model provider to efficiently serve large language models.
|
||||
|
||||
For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).
|
||||
|
||||
@ -380,7 +380,7 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
|
||||
|
||||
### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"
|
||||
|
||||
If the startup or readiness probe failureThreshold is too low for the time needed to startup the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened:
|
||||
If the startup or readiness probe failureThreshold is too low for the time needed to start up the server, Kubernetes scheduler will kill the container. A couple of indications that this has happened:
|
||||
|
||||
1. container log contains "KeyboardInterrupt: terminated"
|
||||
2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted`
|
||||
|
||||
@ -138,7 +138,7 @@ Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & C
|
||||
|
||||
#### Step 1: Add an All2All manager
|
||||
|
||||
The purpose of the All2All Manager is to setup the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
|
||||
The purpose of the All2All Manager is to set up the All2All kernel implementations. The `FusedMoEPrepareAndFinalize` implementations typically fetch a kernel-implementation "handle" from the All2All Manager to invoke the Dispatch and Combine functions. Please look at the All2All Manager implementations [here](gh-file:vllm/distributed/device_communicators/all2all.py).
|
||||
|
||||
#### Step 2: Add a FusedMoEPrepareAndFinalize Type
|
||||
|
||||
|
||||
@ -99,11 +99,11 @@ http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201
|
||||
|
||||
### Multi-process Mode
|
||||
|
||||
In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.
|
||||
In v0, metrics are collected in the engine core process and we use multiprocess mode to make them available in the API server process. See <gh-pr:7279>.
|
||||
|
||||
### Built in Python/Process Metrics
|
||||
|
||||
The following metrics are supported by default by `prometheus_client`, but they are not exposed when multi-process mode is used:
|
||||
The following metrics are supported by default by `prometheus_client`, but they are not exposed when multiprocess mode is used:
|
||||
|
||||
- `python_gc_objects_collected_total`
|
||||
- `python_gc_objects_uncollectable_total`
|
||||
|
||||
@ -52,7 +52,7 @@ Check out <gh-file:examples/offline_inference/multilora_inference.py> for an exa
|
||||
## Serving LoRA Adapters
|
||||
|
||||
LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use
|
||||
`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kickoff the server:
|
||||
`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server:
|
||||
|
||||
```bash
|
||||
vllm serve meta-llama/Llama-2-7b-hf \
|
||||
|
||||
@ -143,7 +143,7 @@ OpenAI Python client library does not officially support `reasoning_content` att
|
||||
print(content, end="", flush=True)
|
||||
```
|
||||
|
||||
Remember to check whether the `reasoning_content` exists in the response before accessing it. You could checkout the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
|
||||
Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
|
||||
|
||||
## Tool Calling
|
||||
|
||||
|
||||
@ -205,7 +205,7 @@ This section covers the OpenAI beta wrapper over the `client.chat.completions.cr
|
||||
|
||||
At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104).
|
||||
|
||||
For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
|
||||
For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct`
|
||||
|
||||
Here is a simple example demonstrating how to get structured output using Pydantic models:
|
||||
|
||||
|
||||
@ -140,8 +140,8 @@ Alternatively, users can directly call the NxDI library to trace and compile you
|
||||
|
||||
- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
|
||||
compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
|
||||
artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set,
|
||||
but the directory does not exist, or the contents are invalid, Neuron will also fallback to a new compilation and store the artifacts
|
||||
artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set,
|
||||
but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts
|
||||
under this specified path.
|
||||
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
|
||||
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# --8<-- [start:installation]
|
||||
|
||||
vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS.
|
||||
vLLM has experimental support for macOS with Apple Silicon. For now, users must build from source to natively run on macOS.
|
||||
|
||||
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VE
|
||||
|
||||
#### Install the latest code
|
||||
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`.
|
||||
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
|
||||
|
||||
```bash
|
||||
uv pip install -U vllm \
|
||||
|
||||
@ -149,7 +149,7 @@ Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm
|
||||
**This step is optional as this rocm_base image is usually prebuilt and stored at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
|
||||
If you choose to build this rocm_base image yourself, the steps are as follows.
|
||||
|
||||
It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
|
||||
It is important that the user kicks off the docker build using buildkit. Either the user sets DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
|
||||
|
||||
```json
|
||||
{
|
||||
@ -170,7 +170,7 @@ DOCKER_BUILDKIT=1 docker build \
|
||||
#### Build an image with vLLM
|
||||
|
||||
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
|
||||
It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
|
||||
It is important that the user kicks off the docker build using buildkit. Either the user sets `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
|
||||
|
||||
```bash
|
||||
{
|
||||
|
||||
@ -258,4 +258,4 @@ Expected output:
|
||||
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}}
|
||||
```
|
||||
|
||||
A openai client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
|
||||
An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
|
||||
|
||||
@ -40,7 +40,7 @@ If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it mean
|
||||
|
||||
#### Custom models
|
||||
|
||||
If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM!
|
||||
If a model is neither supported natively by vLLM nor Transformers, it can still be used in vLLM!
|
||||
|
||||
For a model to be compatible with the Transformers backend for vLLM it must:
|
||||
|
||||
@ -358,7 +358,7 @@ th {
|
||||
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ |
|
||||
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | | ✅︎ |
|
||||
| `GptOssForCausalLM` | GPT-OSS | `openai/gpt-oss-120b`, `openai/gpt-oss-20b` | | ✅︎ | ✅︎ |
|
||||
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
@ -497,6 +497,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
|
||||
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | ✅︎ |
|
||||
| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | ✅︎ |
|
||||
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
|
||||
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | ✅︎ |
|
||||
@ -513,6 +514,9 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
|
||||
```
|
||||
|
||||
!!! note
|
||||
The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. Because that name is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to select the `GteNewForSequenceClassification` architecture.
|
||||
|
||||
!!! note
|
||||
Load the official original `mxbai-rerank-v2` by using the following command.
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ tail ~/.config/vllm/usage_stats.json
|
||||
|
||||
## Opting out
|
||||
|
||||
You can opt-out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
|
||||
You can opt out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
|
||||
|
||||
```bash
|
||||
# Any of the following methods can disable usage stats collection
|
||||
|
||||
@ -138,7 +138,7 @@ def main():
|
||||
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
|
||||
if not args.custom_mm_prompts:
|
||||
outputs = llm.generate(
|
||||
TokensPrompt(prompt_token_ids=prompt_ids),
|
||||
[TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
else:
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
from enum import Enum
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
|
||||
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@ -33,6 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.sequence import Logprob
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -454,11 +456,10 @@ class HfRunner:
|
||||
# output is final logits
|
||||
all_inputs = self.get_inputs(prompts)
|
||||
outputs = []
|
||||
problem_type = getattr(self.config, "problem_type", "")
|
||||
|
||||
for inputs in all_inputs:
|
||||
output = self.model(**self.wrap_device(inputs))
|
||||
|
||||
problem_type = getattr(self.config, "problem_type", "")
|
||||
|
||||
if problem_type == "regression":
|
||||
logits = output.logits[0].tolist()
|
||||
elif problem_type == "multi_label_classification":
|
||||
@ -602,7 +603,7 @@ class HfRunner:
|
||||
def _hidden_states_to_logprobs(
|
||||
self,
|
||||
hidden_states: tuple[tuple[torch.Tensor, ...], ...],
|
||||
num_logprobs: int,
|
||||
num_logprobs: Optional[int],
|
||||
) -> tuple[list[dict[int, float]], int]:
|
||||
seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
|
||||
output_len = len(hidden_states)
|
||||
@ -630,7 +631,7 @@ class HfRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
videos: Optional[PromptVideoInput] = None,
|
||||
@ -677,7 +678,7 @@ class HfRunner:
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
num_logprobs: Optional[int],
|
||||
images: Optional[PromptImageInput] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[TokensTextLogprobs]:
|
||||
@ -966,7 +967,7 @@ class VllmRunner:
|
||||
self,
|
||||
prompts: list[str],
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
num_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int] = None,
|
||||
images: Optional[PromptImageInput] = None,
|
||||
audios: Optional[PromptAudioInput] = None,
|
||||
@ -991,11 +992,40 @@ class VllmRunner:
|
||||
videos=videos,
|
||||
**kwargs)
|
||||
|
||||
def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]:
    """
    Return the perplexity score associated with generating the prompts.

    Every prompt is run through ``generate_greedy_logprobs`` with
    ``max_tokens=1`` and ``num_prompt_logprobs=0``. The perplexity is
    ``exp(-mean(logprob))`` taken over the prompt logprob entries that
    follow the first position.

    :param prompts: list of prompts to score
    :return: perplexity score of each prompt, in the same order
    :raises AssertionError: if the prompt logprobs do not have the expected
        layout, or if a prompt yields no scored entries after the first
        position (the mean would otherwise be a division by zero)
    """
    outputs = self.generate_greedy_logprobs(prompts,
                                            max_tokens=1,
                                            num_logprobs=None,
                                            num_prompt_logprobs=0)

    perplexities = []
    for output in outputs:
        output = cast(TokensTextLogprobsPromptLogprobs, output)
        # Element 3 of the output tuple holds the per-position prompt
        # logprobs.
        token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
        # The first position is expected to carry no logprob entry.
        assert token_datas[0] is None

        token_log_probs = []
        for token_data in token_datas[1:]:
            assert token_data is not None
            # Exactly one entry per position is expected here.
            assert len(token_data) == 1
            token_log_prob = list(token_data.values())[0].logprob
            token_log_probs.append(token_log_prob)

        # Fail with a readable message instead of a bare ZeroDivisionError
        # when there is nothing to average.
        assert token_log_probs, (
            "cannot compute perplexity: prompt produced no logprob entries "
            "after the first position")

        perplexity = math.exp(-sum(token_log_probs) / len(token_log_probs))
        perplexities.append(perplexity)

    return perplexities
|
||||
|
||||
def generate_encoder_decoder_greedy_logprobs(
|
||||
self,
|
||||
encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
num_logprobs: Optional[int],
|
||||
num_prompt_logprobs: Optional[int] = None,
|
||||
skip_special_tokens: bool = True,
|
||||
) -> Union[list[TokensTextLogprobs],
|
||||
|
||||
@ -790,6 +790,78 @@ def test_gather_and_maybe_dequant_cache_mla(kv_lora_rank, qk_rope_head_dim,
|
||||
torch.testing.assert_close(dst, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", [512])
@pytest.mark.parametrize("qk_rope_head_dim", [64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("max_seq_len", [512])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("kv_cache_dtype",
                         ["auto"])  # You can also test "fp8" if needed.
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_cp_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
                             num_blocks, max_seq_len, batch_size, dtype,
                             kv_cache_dtype, device):
    """Compare ``ops.cp_gather_cache`` with a pure-PyTorch reference.

    The reference walks each sequence's row of the block table, takes
    every block but the last in full, takes only the leading rows of the
    last block, and concatenates the pieces of all sequences in batch
    order.
    """
    row_width = kv_lora_rank + qk_rope_head_dim
    src_cache = _create_mla_cache(num_blocks, block_size, row_width, dtype,
                                  kv_cache_dtype, device)
    _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype)

    # Random per-sequence lengths in [0, max_seq_len].
    seq_len_tensor = torch.randint(0,
                                   max_seq_len + 1, (batch_size, ),
                                   device=device)
    total_tokens = seq_len_tensor.sum()

    # Cumulative lengths with a leading zero.
    cu_seq_lens = torch.zeros(batch_size + 1,
                              dtype=torch.int32,
                              device=device)
    cu_seq_lens[1:] = torch.cumsum(seq_len_tensor, dim=0).to(torch.int32)
    print("seq_len_tensor", seq_len_tensor)

    # Ceil-divide to get the number of blocks each sequence occupies.
    blocks_per_seq = (seq_len_tensor + block_size - 1) // block_size
    block_table = torch.empty((batch_size, num_blocks),
                              dtype=torch.int32,
                              device=device)
    for row in range(batch_size):
        # Every sequence gets its own random ordering of the block ids.
        block_table[row, :] = torch.randperm(num_blocks, device=device)

    dst = torch.zeros((total_tokens, row_width),
                      dtype=src_cache.dtype,
                      device=device)

    expected_batches = []
    for row in range(batch_size):
        seq_len = seq_len_tensor[row]
        if seq_len == 0:
            # Empty sequences contribute nothing to the reference.
            continue
        n_blocks = blocks_per_seq[row]
        block_ids = block_table[row, :n_blocks].tolist()

        # All blocks except the last are taken in full ...
        pieces = [src_cache[block_id] for block_id in block_ids[:-1]]
        # ... and the last block only contributes its leading rows.
        tail_len = seq_len - (n_blocks - 1) * block_size
        pieces.append(src_cache[block_ids[-1], :tail_len, :])

        expected_batches.append(torch.cat(pieces, dim=0))
    expected = torch.cat(expected_batches, dim=0)

    op_args = (src_cache, dst, block_table, cu_seq_lens, batch_size, None)
    opcheck(
        torch.ops._C_cache_ops.cp_gather_cache,
        op_args,
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )

    ops.cp_gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size)
    torch.testing.assert_close(dst, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
|
||||
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
|
||||
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
|
||||
|
||||
@ -51,6 +51,9 @@ def correctness_test_embed_models(hf_runner,
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
|
||||
@ -172,6 +172,9 @@ def mteb_test_embed_models(hf_runner,
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
@ -284,6 +287,9 @@ def mteb_test_rerank_models(hf_runner,
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
if model_info.hf_overrides is not None:
|
||||
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
with vllm_runner(model_info.name,
|
||||
runner="pooling",
|
||||
max_model_len=None,
|
||||
|
||||
@ -13,7 +13,14 @@ from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
|
||||
architecture="GemmaForSequenceClassification"),
|
||||
architecture="GemmaForSequenceClassification",
|
||||
hf_overrides={
|
||||
"architectures":
|
||||
["GemmaForSequenceClassification"],
|
||||
"classifier_from_token": ["Yes"],
|
||||
"method":
|
||||
"no_post_processing",
|
||||
}),
|
||||
]
|
||||
|
||||
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
|
||||
@ -119,22 +126,9 @@ class GemmaMtebEncoder(VllmMtebEncoder):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo,
|
||||
monkeypatch) -> None:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
assert model_info.architecture == "GemmaForSequenceClassification"
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {
|
||||
"hf_overrides": {
|
||||
"architectures": ["GemmaForSequenceClassification"],
|
||||
"classifier_from_token": ["Yes"],
|
||||
"method": "no_post_processing",
|
||||
}
|
||||
}
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
|
||||
mteb_test_rerank_models(GemmaRerankerHfRunner,
|
||||
vllm_runner,
|
||||
model_info,
|
||||
vllm_extra_kwargs,
|
||||
vllm_mteb_encoder=GemmaMtebEncoder)
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
@ -33,12 +32,15 @@ MODELS = [
|
||||
########### NewModel
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=True),
|
||||
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
|
||||
architecture="GteNewModel",
|
||||
hf_overrides={"architectures": ["GteNewModel"]},
|
||||
enable_test=True),
|
||||
########### Qwen2ForCausalLM
|
||||
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
|
||||
@ -60,11 +62,16 @@ MODELS = [
|
||||
]
|
||||
|
||||
RERANK_MODELS = [
|
||||
# classifier_pooling: mean
|
||||
CLSPoolingRerankModelInfo(
|
||||
# classifier_pooling: mean
|
||||
"Alibaba-NLP/gte-reranker-modernbert-base",
|
||||
architecture="ModernBertForSequenceClassification",
|
||||
enable_test=True),
|
||||
CLSPoolingRerankModelInfo(
|
||||
"Alibaba-NLP/gte-multilingual-reranker-base",
|
||||
architecture="GteNewForSequenceClassification",
|
||||
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
|
||||
enable_test=True),
|
||||
]
|
||||
|
||||
|
||||
@ -75,12 +82,7 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
check_transformers_version(model_info.name,
|
||||
max_transformers_version="4.53.2")
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "GteNewModel":
|
||||
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
|
||||
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
vllm_extra_kwargs)
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
@ -91,12 +93,8 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
|
||||
check_transformers_version(model_info.name,
|
||||
max_transformers_version="4.53.2")
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "GteNewModel":
|
||||
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
|
||||
|
||||
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
|
||||
example_prompts, vllm_extra_kwargs)
|
||||
example_prompts)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
|
||||
@ -10,12 +10,20 @@ from tests.conftest import HfRunner
|
||||
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
|
||||
mxbai_rerank_hf_overrides = {
|
||||
"architectures": ["Qwen2ForSequenceClassification"],
|
||||
"classifier_from_token": ["0", "1"],
|
||||
"method": "from_2_way_softmax",
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
enable_test=True),
|
||||
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
|
||||
architecture="Qwen2ForSequenceClassification",
|
||||
hf_overrides=mxbai_rerank_hf_overrides,
|
||||
enable_test=False)
|
||||
]
|
||||
|
||||
@ -71,13 +79,4 @@ class MxbaiRerankerHfRunner(HfRunner):
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
vllm_extra_kwargs: dict[str, Any] = {}
|
||||
if model_info.architecture == "Qwen2ForSequenceClassification":
|
||||
vllm_extra_kwargs["hf_overrides"] = {
|
||||
"architectures": ["Qwen2ForSequenceClassification"],
|
||||
"classifier_from_token": ["0", "1"],
|
||||
"method": "from_2_way_softmax",
|
||||
}
|
||||
|
||||
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info,
|
||||
vllm_extra_kwargs)
|
||||
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info)
|
||||
|
||||
@ -11,12 +11,20 @@ from tests.utils import multi_gpu_test
|
||||
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
|
||||
from .mteb_utils import mteb_test_rerank_models
|
||||
|
||||
qwen3_reranker_hf_overrides = {
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
}
|
||||
|
||||
RERANK_MODELS = [
|
||||
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=True),
|
||||
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
|
||||
architecture="Qwen3ForSequenceClassification",
|
||||
hf_overrides=qwen3_reranker_hf_overrides,
|
||||
enable_test=False)
|
||||
]
|
||||
|
||||
@ -74,18 +82,7 @@ class Qwen3RerankerHfRunner(HfRunner):
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
|
||||
|
||||
assert model_info.architecture == "Qwen3ForSequenceClassification"
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {
|
||||
"hf_overrides": {
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
}
|
||||
}
|
||||
|
||||
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
|
||||
vllm_extra_kwargs)
|
||||
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", RERANK_MODELS)
|
||||
@ -96,11 +93,6 @@ def test_rerank_models_mteb_tp(vllm_runner,
|
||||
assert model_info.architecture == "Qwen3ForSequenceClassification"
|
||||
|
||||
vllm_extra_kwargs: dict[str, Any] = {
|
||||
"hf_overrides": {
|
||||
"architectures": ["Qwen3ForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
},
|
||||
"tensor_parallel_size": 2,
|
||||
}
|
||||
|
||||
|
||||
@ -189,23 +189,21 @@ VLM_TEST_SETTINGS = {
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
# FIXME(Isotr0py): Enable this test after
|
||||
# https://github.com/huggingface/transformers/pull/39470 released
|
||||
# "idefics3-transformers": VLMTestInfo(
|
||||
# models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
# prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
# img_idx_to_prompt=lambda idx: "<image>",
|
||||
# max_model_len=8192,
|
||||
# max_num_seqs=2,
|
||||
# auto_cls=AutoModelForImageTextToText,
|
||||
# hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
|
||||
# image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
# vllm_runner_kwargs={
|
||||
# "model_impl": "transformers",
|
||||
# },
|
||||
# marks=[pytest.mark.core_model],
|
||||
# ),
|
||||
"idefics3-transformers": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
# Pixel values from processor are not 4D or 5D arrays
|
||||
"qwen2_5_vl-transformers": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
@ -322,10 +320,6 @@ VLM_TEST_SETTINGS = {
|
||||
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
|
||||
num_logprobs=10,
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
# FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
|
||||
# should enable this again after the fix is released:
|
||||
# https://github.com/huggingface/transformers/pull/39915
|
||||
marks=[pytest.mark.skip("HF model is broken")],
|
||||
),
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
|
||||
@ -365,6 +365,10 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
|
||||
|
||||
# [Cross-encoder]
|
||||
"BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501
|
||||
"GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501
|
||||
trust_remote_code=True,
|
||||
hf_overrides={
|
||||
"architectures": ["GteNewForSequenceClassification"]}),# noqa: E501
|
||||
"ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501
|
||||
"RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501
|
||||
"XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501
|
||||
|
||||
@ -3,7 +3,8 @@
|
||||
|
||||
import warnings
|
||||
from collections.abc import Sequence
|
||||
from typing import Any, NamedTuple, Optional, Union
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@ -339,36 +340,43 @@ def softmax(data):
|
||||
return F.softmax(data, dim=-1)
|
||||
|
||||
|
||||
class EmbedModelInfo(NamedTuple):
|
||||
@dataclass
|
||||
class ModelInfo:
|
||||
name: str
|
||||
is_matryoshka: bool = False
|
||||
matryoshka_dimensions: Optional[list[int]] = None
|
||||
architecture: str = ""
|
||||
dtype: str = "auto"
|
||||
hf_overrides: Optional[dict[str, Any]] = None
|
||||
default_pooling_type: str = ""
|
||||
enable_test: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class EmbedModelInfo(ModelInfo):
|
||||
is_matryoshka: bool = False
|
||||
matryoshka_dimensions: Optional[list[int]] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CLSPoolingEmbedModelInfo(EmbedModelInfo):
|
||||
default_pooling_type: str = "CLS"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LASTPoolingEmbedModelInfo(EmbedModelInfo):
|
||||
default_pooling_type: str = "LAST"
|
||||
|
||||
|
||||
class RerankModelInfo(NamedTuple):
|
||||
name: str
|
||||
architecture: str = ""
|
||||
dtype: str = "auto"
|
||||
default_pooling_type: str = ""
|
||||
enable_test: bool = True
|
||||
@dataclass
|
||||
class RerankModelInfo(ModelInfo):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class CLSPoolingRerankModelInfo(RerankModelInfo):
|
||||
default_pooling_type: str = "CLS"
|
||||
|
||||
|
||||
@dataclass
|
||||
class LASTPoolingRerankModelInfo(RerankModelInfo):
|
||||
default_pooling_type: str = "LAST"
|
||||
|
||||
|
||||
@ -719,3 +719,25 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=20)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="This test is skipped on non-CUDA platform.")
@pytest.mark.parametrize("model,prompt,exp_perplexity", [
    (
        model_id,
        "Flat is better than nested.\nSparse is better than dense.",
        150.0,
    ) for model_id in (
        "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
        "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
    )
])
def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
                                                  exp_perplexity):
    """Check that each listed checkpoint scores the prompt at or below the
    expected perplexity bound when run eagerly."""
    with vllm_runner(model, enforce_eager=True) as llm:
        # A single prompt is scored, so only the first result is needed.
        scores = llm.generate_prompt_perplexity([prompt])
        perplexity = scores[0]
        print(perplexity)
        assert perplexity <= exp_perplexity
|
||||
@ -1625,6 +1625,20 @@ def concat_and_cache_mla(
|
||||
scale)
|
||||
|
||||
|
||||
def cp_fused_concat_and_cache_mla(
    kv_c: torch.Tensor,
    k_pe: torch.Tensor,
    cp_local_token_select_indices: torch.Tensor,
    kv_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    scale: torch.Tensor,
) -> None:
    """Call the ``_C_cache_ops.cp_fused_concat_and_cache_mla`` custom op.

    All arguments are forwarded positionally, unchanged and in declaration
    order; nothing is returned.

    NOTE(review): presumably the op writes into ``kv_cache`` in place -
    confirm against the kernel implementation.
    """
    fused_op = torch.ops._C_cache_ops.cp_fused_concat_and_cache_mla
    fused_op(
        kv_c,
        k_pe,
        cp_local_token_select_indices,
        kv_cache,
        slot_mapping,
        kv_cache_dtype,
        scale,
    )
|
||||
|
||||
|
||||
def copy_blocks(key_caches: list[torch.Tensor],
|
||||
value_caches: list[torch.Tensor],
|
||||
block_mapping: torch.Tensor) -> None:
|
||||
@ -1662,6 +1676,16 @@ def gather_and_maybe_dequant_cache(
|
||||
scale, seq_starts)
|
||||
|
||||
|
||||
def cp_gather_cache(src_cache: torch.Tensor,
                    dst: torch.Tensor,
                    block_table: torch.Tensor,
                    cu_seq_lens: torch.Tensor,
                    batch_size: int,
                    seq_starts: Optional[torch.Tensor] = None) -> None:
    """Call the ``_C_cache_ops.cp_gather_cache`` custom op.

    All arguments are forwarded positionally in declaration order;
    ``seq_starts`` is passed through as ``None`` when omitted. Nothing is
    returned.

    NOTE(review): presumably the op fills ``dst`` in place with rows
    gathered from ``src_cache`` according to ``block_table`` and
    ``cu_seq_lens`` - confirm against the kernel implementation.
    """
    gather_op = torch.ops._C_cache_ops.cp_gather_cache
    gather_op(
        src_cache,
        dst,
        block_table,
        cu_seq_lens,
        batch_size,
        seq_starts,
    )
|
||||
|
||||
|
||||
def get_device_attribute(attribute: int, device: int) -> int:
    """Return the result of the ``_C_cuda_utils.get_device_attribute`` op
    for the given ``attribute`` and ``device``."""
    query_op = torch.ops._C_cuda_utils.get_device_attribute
    return query_op(attribute, device)
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .inductor_pass import enable_fake_mode
|
||||
from .vllm_inductor_pass import VllmInductorPass
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -61,6 +62,7 @@ class ActivationQuantFusionPass(VllmInductorPass):
|
||||
https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
|
||||
"""
|
||||
|
||||
@enable_fake_mode
|
||||
def __init__(self, config: VllmConfig):
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
|
||||
outputs.append(
|
||||
SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
|
||||
|
||||
# sort by intetger graph_id, rather than string name
|
||||
# sort by integer graph_id, rather than string name
|
||||
outputs.sort(key=lambda x: x.graph_id)
|
||||
|
||||
return split_gm, outputs
|
||||
@ -424,7 +424,7 @@ class VllmBackend:
|
||||
|
||||
# if the model is initialized with a non-empty prefix,
|
||||
# then usually it's enough to use that prefix,
|
||||
# e.g. launguage_model, vision_model, etc.
|
||||
# e.g. language_model, vision_model, etc.
|
||||
# when multiple parts are initialized as independent
|
||||
# models, we need to use the model_tag to distinguish
|
||||
# them, e.g. backbone (default), eagle_head, etc.
|
||||
|
||||
@ -19,6 +19,7 @@ from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import direct_register_custom_op
|
||||
|
||||
from .inductor_pass import enable_fake_mode
|
||||
from .vllm_inductor_pass import VllmInductorPass
|
||||
|
||||
FP8_DTYPE = current_platform.fp8_dtype()
|
||||
@ -349,6 +350,7 @@ class AllGatherCutlassScaledMMPattern(BasePattern):
|
||||
|
||||
class AsyncTPPass(VllmInductorPass):
|
||||
|
||||
@enable_fake_mode
|
||||
def __init__(self, config: VllmConfig):
|
||||
super().__init__(config)
|
||||
|
||||
@ -1121,6 +1123,10 @@ class AllReduceFusionPass(VllmInductorPass):
|
||||
# in fallback path, when we don't use flashinfer
|
||||
fuse_rms_quant=config.compilation_config.pass_config.enable_fusion)
|
||||
|
||||
self.register_patterns()
|
||||
|
||||
@enable_fake_mode
|
||||
def register_patterns(self):
|
||||
for epsilon in [1e-5, 1e-6]:
|
||||
AllReduceFusedRMSNormStaticQuantFP8Pattern(
|
||||
epsilon,
|
||||
|
||||
@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .fx_utils import find_getitem_maybe
|
||||
from .inductor_pass import enable_fake_mode
|
||||
from .multi_output_match import MultiOutputMatch
|
||||
from .vllm_inductor_pass import VllmInductorPass
|
||||
|
||||
@ -528,6 +529,7 @@ class FusionPass(VllmInductorPass):
|
||||
cls._instance.pass_config = config.compilation_config.pass_config
|
||||
return cls._instance
|
||||
|
||||
@enable_fake_mode
|
||||
def __init__(self, config: VllmConfig):
|
||||
assert self.__class__._instance is None, \
|
||||
"FusionPass singleton instance already exists"
|
||||
|
||||
@ -7,8 +7,6 @@ import torch
|
||||
import torch._inductor.pattern_matcher as pm
|
||||
from torch._higher_order_ops.auto_functionalize import auto_functionalized
|
||||
from torch._inductor.pattern_matcher import PatternMatcherPass
|
||||
from torch._subclasses.fake_tensor import (FakeTensorMode,
|
||||
unset_fake_temporarily)
|
||||
|
||||
from vllm.attention import Attention
|
||||
from vllm.config import VllmConfig, get_layers_from_vllm_config
|
||||
@ -19,6 +17,7 @@ from vllm.platforms import current_platform
|
||||
from vllm.utils import round_up
|
||||
|
||||
from .fusion import QUANT_OPS, empty_bf16, empty_fp32, empty_i32
|
||||
from .inductor_pass import enable_fake_mode
|
||||
from .vllm_inductor_pass import VllmInductorPass
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -139,24 +138,21 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
|
||||
output_block_scale=None)
|
||||
return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])
|
||||
|
||||
# Need custom fake mode, otherwise tracing happens with real tensors.
|
||||
# That would not work for the unified_attention custom op.
|
||||
with unset_fake_temporarily(), FakeTensorMode():
|
||||
inputs = [
|
||||
empty_bf16(5, self.num_heads, self.head_size), # q
|
||||
empty_bf16(5, self.num_heads, self.head_size), # k
|
||||
empty_bf16(5, self.num_heads, self.head_size), # v
|
||||
empty_bf16(5, self.num_heads, self.head_size), # attn_output
|
||||
self.empty_quant(5, self.num_heads *
|
||||
self.head_size), # quant_output
|
||||
empty_fp32(1, 1) # scale
|
||||
]
|
||||
inputs = [
|
||||
empty_bf16(5, self.num_heads, self.head_size), # q
|
||||
empty_bf16(5, self.num_heads, self.head_size), # k
|
||||
empty_bf16(5, self.num_heads, self.head_size), # v
|
||||
empty_bf16(5, self.num_heads, self.head_size), # attn_output
|
||||
self.empty_quant(5,
|
||||
self.num_heads * self.head_size), # quant_output
|
||||
empty_fp32(1, 1) # scale
|
||||
]
|
||||
|
||||
pm.register_replacement(
|
||||
pattern, replacement, inputs,
|
||||
AttentionQuantPattern.wrap_trace_fn(
|
||||
AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only),
|
||||
pm_pass)
|
||||
pm.register_replacement(
|
||||
pattern, replacement, inputs,
|
||||
AttentionQuantPattern.wrap_trace_fn(
|
||||
AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only),
|
||||
pm_pass)
|
||||
|
||||
|
||||
class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
@ -219,27 +215,23 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
|
||||
[-1, self.num_heads * self.head_size // 2])
|
||||
return output, at2[2]
|
||||
|
||||
# Need custom fake mode, otherwise tracing happens with real tensors.
|
||||
# That would not work for the unified_attention custom op.
|
||||
with unset_fake_temporarily(), FakeTensorMode():
|
||||
inputs = [
|
||||
empty_bf16(5, self.num_heads, self.head_size), # q
|
||||
empty_bf16(5, self.num_heads, self.head_size), # k
|
||||
empty_bf16(5, self.num_heads, self.head_size), # v
|
||||
empty_bf16(5, self.num_heads, self.head_size), # output_attn
|
||||
self.empty_quant(5, self.num_heads * self.head_size //
|
||||
2), # output_quant
|
||||
empty_i32(128,
|
||||
round_up(self.num_heads * self.head_size // 16,
|
||||
4)), # output_scale
|
||||
empty_fp32(1, 1), # input_scale
|
||||
]
|
||||
inputs = [
|
||||
empty_bf16(5, self.num_heads, self.head_size), # q
|
||||
empty_bf16(5, self.num_heads, self.head_size), # k
|
||||
empty_bf16(5, self.num_heads, self.head_size), # v
|
||||
empty_bf16(5, self.num_heads, self.head_size), # output_attn
|
||||
self.empty_quant(5, self.num_heads * self.head_size //
|
||||
2), # output_quant
|
||||
empty_i32(128, round_up(self.num_heads * self.head_size // 16,
|
||||
4)), # output_scale
|
||||
empty_fp32(1, 1), # input_scale
|
||||
]
|
||||
|
||||
pm.register_replacement(
|
||||
pattern, replacement, inputs,
|
||||
AttentionQuantPattern.wrap_trace_fn(
|
||||
AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only),
|
||||
pm_pass)
|
||||
pm.register_replacement(
|
||||
pattern, replacement, inputs,
|
||||
AttentionQuantPattern.wrap_trace_fn(
|
||||
AttentionQuantPattern.fx_view_to_reshape, pm.fwd_only),
|
||||
pm_pass)
|
||||
|
||||
|
||||
class AttnFusionPass(VllmInductorPass):
|
||||
@ -255,6 +247,7 @@ class AttnFusionPass(VllmInductorPass):
|
||||
support are attention kernels, which need to support fusing output quant.
|
||||
"""
|
||||
|
||||
@enable_fake_mode
|
||||
def __init__(self, config: VllmConfig):
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import functools
|
||||
import hashlib
|
||||
import inspect
|
||||
import json
|
||||
@ -10,6 +11,8 @@ from typing import Any, Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import fx
|
||||
from torch._subclasses.fake_tensor import (FakeTensorMode,
|
||||
unset_fake_temporarily)
|
||||
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
@ -114,3 +117,20 @@ class CallableInductorPass(InductorPass):
|
||||
|
||||
def uuid(self) -> Any:
|
||||
return self._uuid
|
||||
|
||||
|
||||
def enable_fake_mode(fn: Callable[..., Any]) -> Callable[..., Any]:
|
||||
"""
|
||||
Applies a FakeTensorMode context. This is useful when you don't want to
|
||||
create or run things with real tensors.
|
||||
"""
|
||||
|
||||
@functools.wraps(fn)
|
||||
def fn_new(*args, **kwargs) -> Any:
|
||||
with torch._guards.tracing(
|
||||
None), unset_fake_temporarily(), FakeTensorMode():
|
||||
result = fn(*args, **kwargs)
|
||||
|
||||
return result
|
||||
|
||||
return fn_new
|
||||
|
||||
@ -14,6 +14,7 @@ from vllm.distributed.parallel_state import (
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .inductor_pass import enable_fake_mode
|
||||
from .vllm_inductor_pass import VllmInductorPass
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -436,6 +437,7 @@ class SequenceParallelismPass(VllmInductorPass):
|
||||
performance.
|
||||
"""
|
||||
|
||||
@enable_fake_mode
|
||||
def __init__(self, config: VllmConfig):
|
||||
super().__init__(config)
|
||||
|
||||
|
||||
@ -2439,8 +2439,8 @@ class LoRAConfig:
|
||||
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
|
||||
"""Data type for LoRA. If auto, will default to base model dtype."""
|
||||
lora_extra_vocab_size: int = 256
|
||||
"""Maximum size of extra vocabulary that can be present in a LoRA adapter
|
||||
(added to the base model vocabulary)."""
|
||||
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
|
||||
LoRA adapter. Will be removed in v0.12.0."""
|
||||
lora_vocab_padding_size: ClassVar[int] = current_platform\
|
||||
.get_lora_vocab_padding_size()
|
||||
|
||||
@ -2482,6 +2482,12 @@ class LoRAConfig:
|
||||
return hash_str
|
||||
|
||||
def __post_init__(self):
|
||||
# Deprecation warning for lora_extra_vocab_size
|
||||
logger.warning(
|
||||
"`lora_extra_vocab_size` is deprecated and will be removed "
|
||||
"in v0.12.0. Additional vocabulary support for "
|
||||
"LoRA adapters is being phased out.")
|
||||
|
||||
# Setting the maximum rank to 512 should be able to satisfy the vast
|
||||
# majority of applications.
|
||||
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
|
||||
|
||||
@ -115,7 +115,7 @@ class CacheConfig:
|
||||
|
||||
In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
|
||||
some layers can skip tokens corresponding to prefill. This flag enables
|
||||
attention metadata for eligible layers to be overriden with metadata
|
||||
attention metadata for eligible layers to be overridden with metadata
|
||||
necessary for implementing this optimization in some models (e.g. Gemma3n)
|
||||
"""
|
||||
|
||||
|
||||
@ -1053,7 +1053,7 @@ class EngineArgs:
|
||||
self.trust_remote_code, self.revision,
|
||||
self.code_revision, self.config_format)
|
||||
|
||||
# if loading a SpeculatorsConfig, load the specualtive_config
|
||||
# if loading a SpeculatorsConfig, load the speculative_config
|
||||
# details from the config directly
|
||||
# no user input required / expected
|
||||
if isinstance(hf_config, SpeculatorsConfig):
|
||||
|
||||
@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
# stores model placehodlers list with corresponding
|
||||
# stores model placeholders list with corresponding
|
||||
# general MM placeholder:
|
||||
# {
|
||||
# "<##IMAGE##>": ["<image>", "<image>", "<image>"],
|
||||
|
||||
@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
|
||||
raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value,
|
||||
detail="Missing 'method' in request body")
|
||||
# For security reason, only serialized string args/kwargs are passed.
|
||||
# User-defined `method` is responsible for deseralization if needed.
|
||||
# User-defined `method` is responsible for deserialization if needed.
|
||||
args: list[str] = body.get("args", [])
|
||||
kwargs: dict[str, str] = body.get("kwargs", {})
|
||||
timeout: Optional[float] = body.get("timeout")
|
||||
|
||||
@ -313,12 +313,14 @@ def log_non_default_args(args: Union[argparse.Namespace, EngineArgs]):
|
||||
|
||||
# Handle EngineArgs instance
|
||||
elif isinstance(args, EngineArgs):
|
||||
default_args = EngineArgs() # Create default instance
|
||||
default_args = EngineArgs(model=args.model) # Create default instance
|
||||
for field in dataclasses.fields(args):
|
||||
current_val = getattr(args, field.name)
|
||||
default_val = getattr(default_args, field.name)
|
||||
if current_val != default_val:
|
||||
non_default_args[field.name] = current_val
|
||||
if default_args.model != EngineArgs.model:
|
||||
non_default_args["model"] = default_args.model
|
||||
else:
|
||||
raise TypeError("Unsupported argument type. " \
|
||||
"Must be argparse.Namespace or EngineArgs instance.")
|
||||
|
||||
@ -190,12 +190,6 @@ class FusedMoEParallelConfig:
|
||||
return (self.use_all2all_kernels
|
||||
and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
|
||||
|
||||
@property
|
||||
def use_flashinfer_cutlass_kernels(self):
|
||||
return (envs.VLLM_USE_FLASHINFER_MOE_FP4
|
||||
and has_flashinfer_cutlass_fused_moe()
|
||||
and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")
|
||||
|
||||
@staticmethod
|
||||
def make(tp_size_: int, dp_size_: int,
|
||||
vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
|
||||
@ -404,7 +398,14 @@ class FusedMoEConfig:
|
||||
|
||||
@property
|
||||
def use_flashinfer_cutlass_kernels(self):
|
||||
return self.moe_parallel_config.use_flashinfer_cutlass_kernels
|
||||
"""
|
||||
Whether to use FlashInfer cutlass kernels for NVFP4 MoE.
|
||||
"""
|
||||
return (self.quant_config is not None
|
||||
and self.quant_config.quant_dtype == "nvfp4"
|
||||
and envs.VLLM_USE_FLASHINFER_MOE_FP4
|
||||
and has_flashinfer_cutlass_fused_moe()
|
||||
and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput")
|
||||
|
||||
@staticmethod
|
||||
def make(
|
||||
|
||||
@ -920,7 +920,7 @@ class FusedMoE(CustomOp):
|
||||
self.batched_router_logits: Optional[torch.Tensor] = None
|
||||
if (self.moe_parallel_config.use_pplx_kernels
|
||||
or self.moe_parallel_config.use_deepep_ll_kernels
|
||||
or self.moe_parallel_config.use_flashinfer_cutlass_kernels):
|
||||
or self.moe_config.use_flashinfer_cutlass_kernels):
|
||||
self.batched_hidden_states = torch.zeros(
|
||||
(moe.max_num_tokens, self.hidden_size),
|
||||
dtype=moe.in_dtype,
|
||||
@ -974,7 +974,7 @@ class FusedMoE(CustomOp):
|
||||
|
||||
@property
|
||||
def use_flashinfer_cutlass_kernels(self):
|
||||
return self.moe_parallel_config.use_flashinfer_cutlass_kernels
|
||||
return self.moe_config.use_flashinfer_cutlass_kernels
|
||||
|
||||
def update_expert_map(self):
|
||||
# ep_size and ep_rank should already be updated
|
||||
@ -1665,7 +1665,7 @@ class FusedMoE(CustomOp):
|
||||
# only when data parallelism (DP) is enabled.
|
||||
use_flashinfer_cutlass_kernels = (
|
||||
self.dp_size > 1
|
||||
and self.moe_parallel_config.use_flashinfer_cutlass_kernels)
|
||||
and self.moe_config.use_flashinfer_cutlass_kernels)
|
||||
if (self.moe_parallel_config.use_pplx_kernels
|
||||
or self.moe_parallel_config.use_deepep_ll_kernels
|
||||
or use_flashinfer_cutlass_kernels):
|
||||
@ -1674,7 +1674,7 @@ class FusedMoE(CustomOp):
|
||||
do_naive_dispatch_combine: bool = (
|
||||
self.dp_size > 1
|
||||
and not self.moe_parallel_config.use_deepep_ht_kernels
|
||||
and not self.moe_parallel_config.use_flashinfer_cutlass_kernels)
|
||||
and not self.moe_config.use_flashinfer_cutlass_kernels)
|
||||
if do_naive_dispatch_combine:
|
||||
hidden_states, router_logits = get_ep_group().dispatch(
|
||||
hidden_states, router_logits)
|
||||
|
||||
@ -35,6 +35,7 @@ logger = init_logger(__name__)
|
||||
|
||||
WEIGHT_LOADER_V2_SUPPORTED = [
|
||||
"CompressedTensorsLinearMethod",
|
||||
"CompressedTensorsLinearTransformMethod",
|
||||
"BitBLASLinearMethod",
|
||||
"GPTQBitBLASLinearMethod",
|
||||
"AWQMarlinLinearMethod",
|
||||
@ -199,6 +200,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
|
||||
set_weight_attrs(weight, extra_weight_attrs)
|
||||
|
||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||
# special postprocessing for CPU SGL
|
||||
if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL:
|
||||
from vllm.model_executor.layers.utils import check_cpu_sgl_kernel
|
||||
N, K = layer.weight.size()
|
||||
@ -1470,7 +1472,7 @@ class QKVCrossParallelLinear(LinearBase):
|
||||
self.bias = torch.nn.Parameter()
|
||||
set_weight_attrs(self.bias, {
|
||||
"output_dim": 0,
|
||||
"weight_loader": self.weight_loader,
|
||||
"weight_loader": self.weight_loader_v1,
|
||||
})
|
||||
else:
|
||||
self.bias = None
|
||||
@ -1580,6 +1582,18 @@ class QKVCrossParallelLinear(LinearBase):
|
||||
k, v = kv_enc.split(self.kv_size, dim=-1)
|
||||
return q, k, v
|
||||
|
||||
def weight_loader_v1(self,
                     param: torch.nn.Parameter,
                     loaded_weight: torch.Tensor,
                     loaded_shard_id: Optional[str] = None):
    """
    Load ``loaded_weight`` into the sub-projection that owns ``param``.

    Shard id ``"q"`` is routed to ``self.q_proj_decoder`` and its loader is
    called without a shard id. Any other value (including ``None``) is routed
    to ``self.kv_proj_encoder`` and the shard id is forwarded as an extra
    positional argument.
    """
    # just like all other parameters, does not yet
    # support loading bias with weight_loader_v2
    if loaded_shard_id == "q":
        proj, extra_args = self.q_proj_decoder, ()
    else:
        proj, extra_args = self.kv_proj_encoder, (loaded_shard_id, )

    # Map the parameter on this wrapper to the matching one on the chosen
    # projection layer before delegating to that layer's loader.
    proj_param = self.select_proj_params(proj, param)
    proj.weight_loader(proj_param, loaded_weight, *extra_args)
|
||||
|
||||
def weight_loader(self,
|
||||
param: torch.nn.Parameter,
|
||||
loaded_weight: torch.Tensor,
|
||||
|
||||
@ -11,6 +11,7 @@ from compressed_tensors.config import (CompressionFormat,
|
||||
from compressed_tensors.quantization import (QuantizationArgs,
|
||||
QuantizationStrategy,
|
||||
QuantizationType)
|
||||
from compressed_tensors.transform import TransformConfig
|
||||
from pydantic import BaseModel
|
||||
|
||||
import vllm.envs as envs
|
||||
@ -30,6 +31,8 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
|
||||
CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
|
||||
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
|
||||
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501
|
||||
CompressedTensorsLinearTransformMethod, get_linear_transform_schemes)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
|
||||
find_matched_target, is_activation_quantization_format,
|
||||
should_ignore_layer)
|
||||
@ -60,6 +63,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
sparsity_ignore_list: list[str],
|
||||
kv_cache_scheme: Optional[dict[str, Any]] = None,
|
||||
config: Optional[dict[str, Any]] = None,
|
||||
transform_config: Optional[TransformConfig] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.ignore = ignore
|
||||
@ -71,6 +75,12 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
self.sparsity_ignore_list = sparsity_ignore_list
|
||||
self.config = config
|
||||
|
||||
if transform_config is not None:
|
||||
self.transform_config = TransformConfig.model_validate(
|
||||
transform_config)
|
||||
else:
|
||||
self.transform_config = None
|
||||
|
||||
def get_linear_method(self) -> "CompressedTensorsLinearMethod":
|
||||
return CompressedTensorsLinearMethod(self)
|
||||
|
||||
@ -103,18 +113,27 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
) -> Optional["QuantizeMethodBase"]:
|
||||
from vllm.attention.layer import Attention # Avoid circular import
|
||||
|
||||
# Check if the layer is skipped for quantization.
|
||||
# TODO (@robertgshaw2): support module names
|
||||
if should_ignore_layer(prefix,
|
||||
ignore=self.ignore,
|
||||
fused_mapping=self.packed_modules_mapping):
|
||||
return UnquantizedLinearMethod()
|
||||
if isinstance(layer, LinearBase):
|
||||
scheme = self.get_scheme(layer=layer, layer_name=prefix)
|
||||
if scheme is None:
|
||||
return UnquantizedLinearMethod()
|
||||
layer.scheme = scheme
|
||||
return CompressedTensorsLinearMethod(self)
|
||||
# collect schemes
|
||||
quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
|
||||
input_tfms, output_tfms = get_linear_transform_schemes(
|
||||
layer, prefix, self.transform_config,
|
||||
self.packed_modules_mapping)
|
||||
|
||||
# choose quantization method
|
||||
quant_method: LinearMethodBase = UnquantizedLinearMethod()
|
||||
if quant_scheme is not None:
|
||||
layer.scheme = quant_scheme
|
||||
quant_method = CompressedTensorsLinearMethod(self)
|
||||
|
||||
# choose transform method
|
||||
if any((input_tfms, output_tfms)):
|
||||
return CompressedTensorsLinearTransformMethod.from_schemes(
|
||||
quant_method, input_tfms, output_tfms)
|
||||
|
||||
else:
|
||||
return quant_method
|
||||
|
||||
if isinstance(layer, Attention):
|
||||
return CompressedTensorsKVCacheMethod(self)
|
||||
if isinstance(layer, FusedMoE):
|
||||
@ -129,6 +148,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
config=config)
|
||||
sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
|
||||
config=config)
|
||||
transform_config = config.get("transform_config")
|
||||
|
||||
return cls(
|
||||
target_scheme_map=target_scheme_map,
|
||||
@ -137,6 +157,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
sparsity_scheme_map=sparsity_scheme_map,
|
||||
sparsity_ignore_list=sparsity_ignore_list,
|
||||
config=config,
|
||||
transform_config=transform_config,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -537,9 +558,11 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
|
||||
# Find the "target" in the compressed-tensors config
|
||||
# that our layer conforms to.
|
||||
# TODO (@robertgshaw): add compressed-tensors as dep
|
||||
# so we do not have to re-write these functions
|
||||
# need to make accelerate optional in ct to do this
|
||||
# TODO (@kylesayrs): support ignore module names with ct matching utils
|
||||
if should_ignore_layer(layer_name,
|
||||
ignore=self.ignore,
|
||||
fused_mapping=self.packed_modules_mapping):
|
||||
return None
|
||||
|
||||
# Will be empty for models with only sparsity
|
||||
weight_quant = input_quant = None
|
||||
@ -556,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
||||
format = scheme_dict.get("format")
|
||||
|
||||
# Find the sparsity scheme of the layer
|
||||
# assume that fused layers inerhit first component's sparsity scheme
|
||||
# assume that fused layers inherit first component's sparsity scheme
|
||||
sparsity_targets = (self.sparsity_scheme_map.keys() -
|
||||
set(self.sparsity_ignore_list))
|
||||
sparsity_scheme: Optional[SparsityCompressionConfig] = None
|
||||
@ -722,7 +745,6 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
|
||||
layer input. See LinearMethodBase for param details
|
||||
|
||||
"""
|
||||
|
||||
scheme = layer.scheme
|
||||
if scheme is None:
|
||||
raise ValueError("A scheme must be defined for each layer")
|
||||
|
||||
@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
) -> "CompressedTensorsMoEMethod":
|
||||
# TODO: @dsikka: refactor this to use schemes as other kernels
|
||||
# are supported + check if the layer is being ignored.
|
||||
# Check if a using "Linear" to select scheems
|
||||
# Check if a using "Linear" to select schemes
|
||||
if "Linear" in quant_config.target_scheme_map:
|
||||
matched_target = "Linear"
|
||||
else:
|
||||
|
||||
@ -0,0 +1,227 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Generator
|
||||
from itertools import accumulate
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from compressed_tensors.transform import (TransformArgs, TransformConfig,
|
||||
TransformLocation, TransformScheme)
|
||||
from compressed_tensors.utils import is_match
|
||||
|
||||
from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED,
|
||||
LinearMethodBase,
|
||||
QKVCrossParallelLinear)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import ( # noqa: E501
|
||||
HadamardTransform)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501
|
||||
TransformTuple)
|
||||
|
||||
|
||||
class CompressedTensorsLinearTransformMethod(LinearMethodBase):
    """
    Wraps `CompressedTensorsLinearMethod` or `UnquantizedLinearMethod` and adds
    input and output transforms to either side of the original apply method
    """

    @classmethod
    def from_schemes(
        cls, quant_method: LinearMethodBase,
        input_tfms: dict[int, TransformTuple],
        output_tfms: dict[int, TransformTuple]
    ) -> "CompressedTensorsLinearTransformMethod":
        """
        Build a transform method from per-partition transform schemes.

        Args:
            quant_method: the linear method being wrapped.
            input_tfms: input transforms keyed by partition index.
            output_tfms: output transforms keyed by partition index.

        At least one of the two mappings must be non-empty.
        """
        assert input_tfms or output_tfms

        # TODO (@ksayers): implement QutlassLinearMethodNvFP4
        # hadacore and fwht can be selected by Transform module

        return cls(quant_method, input_tfms, output_tfms)

    def __init__(self, quant_method: LinearMethodBase,
                 input_tfms: dict[int, TransformTuple],
                 output_tfms: dict[int, TransformTuple]):
        self.quant_method = quant_method
        self.input_tfms = input_tfms
        self.output_tfms = output_tfms

        # Populated by `create_weights` when the matching schemes exist.
        self.input_transform: Optional[HadamardTransform] = None
        self.output_transform: Optional[HadamardTransform] = None

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: list[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        """
        Create the wrapped method's weights, then register the transform
        submodules on `layer` and record the output partition ranges used
        by `apply`.
        """

        # get weight loader for transforms
        weight_loader: Callable = extra_weight_attrs.get(
            "weight_loader")  # type: ignore[assignment]

        # HACK: UnquantizedLinearMethod does not support weight loader v2, but
        # transforms (specifically SharedWeightParameter) requires
        # weight loader v2. Until UnquantizedLinearMethod supports v2, we must
        # hack around this by getting weight loader v1 so ULM can load correctly
        quant_method_name = self.quant_method.__class__.__name__
        if quant_method_name not in WEIGHT_LOADER_V2_SUPPORTED:
            if isinstance(layer, QKVCrossParallelLinear):
                weight_loader_v1 = layer.weight_loader_v1
            else:
                weight_loader_v1 = layer.weight_loader
            extra_weight_attrs["weight_loader"] = weight_loader_v1

        self.quant_method.create_weights(
            layer=layer,
            input_size_per_partition=input_size_per_partition,
            output_partition_sizes=output_partition_sizes,
            input_size=input_size,
            output_size=output_size,
            params_dtype=params_dtype,
            **extra_weight_attrs)

        # validate schemes
        num_partitions = len(output_partition_sizes)
        self._validate_tfm_schemes(num_partitions)

        # create submodules for weight loading
        if len(self.input_tfms) > 0:
            scheme_name = list(self.input_tfms.values())[0].scheme_name
            location = list(self.input_tfms.values())[0].args.location
            transform_name = f"{scheme_name}_{location}"

            # The transform receives the original (pre-hack) weight loader.
            transform = HadamardTransform(self.input_tfms, layer,
                                          weight_loader,
                                          input_size_per_partition,
                                          output_partition_sizes)
            layer.register_module(transform_name, transform)
            self.input_transform = transform

        if len(self.output_tfms) > 0:
            scheme_name = list(self.output_tfms.values())[0].scheme_name
            location = list(self.output_tfms.values())[0].args.location
            transform_name = f"{scheme_name}_{location}"

            transform = HadamardTransform(self.output_tfms, layer,
                                          weight_loader,
                                          input_size_per_partition,
                                          output_partition_sizes)
            layer.register_module(transform_name, transform)
            self.output_transform = transform

        # compute partition ranges for slicing activations
        # each entry is (start offset, length) along the output dimension
        starts = [0] + list(accumulate(output_partition_sizes))[:-1]
        self.partition_ranges = list(zip(starts, output_partition_sizes))

    def process_weights_after_loading(self, layer):
        """
        Post-process the wrapped method's weights, then every
        `HadamardTransform` registered as a direct child of `layer`.
        """
        self.quant_method.process_weights_after_loading(layer)

        for submodule in layer.children():
            if isinstance(submodule, HadamardTransform):
                submodule.process_weights_after_loading()

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Apply the input transform (if any), the wrapped linear method, and
        then the output transform (if any) to each output partition.

        `bias` must be None; this is enforced with an assertion.
        """

        if self.input_transform is not None:
            x = self.input_transform(x)

        assert bias is None
        x = self.quant_method.apply(layer, x, bias)

        # TODO (@ksayers): Write a triton kernel to do this in parallel
        if self.output_transform is not None:
            # Each output partition is transformed separately and written
            # back into its own column slice of `x`.
            for part_id, (start, length) in enumerate(self.partition_ranges):
                x[:, start:start + length] = self.output_transform(
                    x[:, start:start + length], part_id=part_id)

        return x

    def _validate_tfm_schemes(self, num_partitions: int):
        """
        Check that the configured transforms can be applied to a layer with
        `num_partitions` output partitions.

        Input transforms: every partition must carry the same transform.
        Output transforms: all must share one scheme name and one location.

        Returns:
            The `(input_tfms, output_tfms)` pair, unchanged.

        Raises:
            ValueError: if any of the checks above fails.
        """
        if len(self.input_tfms) > 0:
            if 0 not in self.input_tfms:
                raise ValueError("Must have same input")

            reference_tfm = self.input_tfms[0]
            for part_index in range(num_partitions):
                # Use .get so that a partition with no input transform at all
                # is reported as the intended ValueError rather than escaping
                # as a bare KeyError.
                if self.input_tfms.get(part_index) != reference_tfm:
                    raise ValueError("Must have same input")

        if len(self.output_tfms) > 0:
            scheme_name = list(self.output_tfms.values())[0].scheme_name
            location = list(self.output_tfms.values())[0].args.location

            for tfm in self.output_tfms.values():
                if tfm.scheme_name != scheme_name:
                    raise ValueError("Must have same scheme name")
                if tfm.args.location != location:
                    raise ValueError("Must have same location")

        return self.input_tfms, self.output_tfms
|
||||
|
||||
|
||||
def get_linear_transform_schemes(
    layer: torch.nn.Module, layer_name: str,
    transform_config: Optional[TransformConfig],
    packed_modules_mapping: dict[str, list[str]]
) -> tuple[dict[int, TransformTuple], dict[
        int, TransformTuple]]:  # [input_transform, [output_transform, ...]]
    """
    Collect the online input and output transforms that target `layer`.

    The layer name is expanded into its partition names, and every
    (scheme, args) pair from `transform_config` is matched against each
    partition. Matches are recorded under the partition's index.

    Returns:
        `(input_tfms, output_tfms)`, each keyed by partition index.

    Raises:
        ValueError: if a matching online transform has a location other
            than INPUT or OUTPUT.
    """
    # there can only be one transform input scheme per (fused) module
    by_input = {}
    by_output = {}

    part_names = get_layer_partition_names(layer_name, packed_modules_mapping)

    for scheme_name, scheme, args in get_schemes_args(transform_config):
        for idx, name in enumerate(part_names):
            matched = is_match(name, layer, args.targets, args.ignore)
            if not (matched and args.is_online()):
                continue

            # Choose the destination mapping from the transform location.
            if args.location == TransformLocation.INPUT:
                bucket = by_input
            elif args.location == TransformLocation.OUTPUT:
                bucket = by_output
            else:
                raise ValueError(f"Cannot apply `{args.location}` "
                                 f"transform to `{layer_name}`")

            bucket[idx] = TransformTuple(scheme_name, scheme, args)

    return (by_input, by_output)
|
||||
|
||||
|
||||
def get_schemes_args(
    transform_config: Optional[TransformConfig]
) -> Generator[tuple[str, TransformScheme, TransformArgs]]:
    """
    Yield one `(scheme_name, scheme, args)` triple for every entry in
    `scheme.apply`, across all of `transform_config.config_groups`.

    Yields nothing when `transform_config` is None.
    """
    if transform_config is not None:
        for scheme_name, scheme in transform_config.config_groups.items():
            yield from ((scheme_name, scheme, args) for args in scheme.apply)
|
||||
|
||||
|
||||
def get_layer_partition_names(
        layer_name: str,
        packed_modules_mapping: dict[str, list[str]]) -> list[str]:
    """
    Get all partition names associated with this layer.
    Names are returned in order of their partition indices.

    ```python
    mapping = {"gate_up_proj": ["gate_proj", "up_proj"]}

    assert get_layer_partition_names(
        "mlp.gate_up_proj", mapping) == ["mlp.gate_proj", "mlp.up_proj"]
    assert get_layer_partition_names(
        "mlp.down_proj", mapping) == ["mlp.down_proj"]
    ```

    Args:
        layer_name: dotted name of the (possibly fused) layer.
        packed_modules_mapping: maps a fused module suffix to the suffixes
            of the partitions packed into it.

    Returns:
        The partition names, or `[layer_name]` when the layer matches no
        entry in the mapping.
    """
    # Prefer an exact match on the final dotted component. Plain suffix
    # matching is ambiguous when one fused key is a suffix of another
    # (e.g. "kv_proj" vs "qkv_proj") and would then depend on dict order.
    parent, dot, leaf = layer_name.rpartition(".")
    if leaf in packed_modules_mapping:
        return [
            parent + dot + part_suffix
            for part_suffix in packed_modules_mapping[leaf]
        ]

    # Fall back to suffix matching for keys that are not a whole component.
    for fused_suffix, part_suffixes in packed_modules_mapping.items():
        if layer_name.endswith(fused_suffix):
            return [
                layer_name.removesuffix(fused_suffix) + part_suffix
                for part_suffix in part_suffixes
            ]

    return [layer_name]
|
||||
@ -0,0 +1,135 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
from collections.abc import Hashable
|
||||
from typing import Callable, Optional
|
||||
|
||||
import torch
|
||||
from compressed_tensors.transform import TransformLocation, TransformScheme
|
||||
from torch import Tensor
|
||||
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tensor_model_parallel_world_size)
|
||||
from vllm.model_executor.layers.linear import LinearBase
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import ( # noqa: E501
|
||||
TransformTuple)
|
||||
from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.parameter import SharedWeightParameter
|
||||
|
||||
|
||||
class HadamardTransform(torch.nn.Module):
    """
    Class which handles weight loading, postprocessing, and application of
    transforms. Meant to be used with `CompressedTensorsLinearTransformMethod`
    and attention transforms method (not implemented yet)

    One square weight of shape `(weight_size, weight_size)` is registered per
    entry of `transforms`. Entries whose `(id(scheme), weight_size)` key is
    equal are registered under the same `data_key` of the
    `SharedWeightParameter`.
    """
    transforms: dict[int, TransformTuple]  # info parsed from transforms config
    weight: SharedWeightParameter  # container for shared tensors

    kernel: Callable  # function used during application
    scales: dict[int, float]  # hadamard scale, 1 / sqrt(matrix.size(0))

    def __init__(self,
                 transforms: dict[int, TransformTuple],
                 layer: torch.nn.Module,
                 weight_loader: Callable,
                 input_size_per_partition: int,
                 output_partition_sizes: list[int],
                 kernel: Optional[Callable] = None):
        """
        :param transforms: mapping from partition index to the transform
            applied to that partition
        :param layer: layer being transformed; its type decides which of the
            input/output sizes is used as the transform size
        :param weight_loader: loader forwarded to `SharedWeightParameter`
        :param input_size_per_partition: input size of the layer
        :param output_partition_sizes: output size of each partition, indexed
            by partition index
        :param kernel: function used to apply the transform; inferred via
            `_infer_kernel` when None
        :raises NotImplementedError: if tensor parallel world size is > 1
        """
        super().__init__()
        self.transforms = transforms
        self.scales = {}

        if get_tensor_model_parallel_world_size() > 1:
            raise NotImplementedError("Online transforms with tensor "
                                      "parallelism is not supported")

        # Similar to row/col parallel params, but tensors are separate
        # to allow for loading with shared memory
        self.weight = SharedWeightParameter(weight_loader=weight_loader)

        # create shared partition data for each partition of the original weight
        input_size = input_size_per_partition
        for part_index, (_scheme_name, scheme,
                         args) in self.transforms.items():
            output_size = output_partition_sizes[part_index]
            weight_size = self._get_weight_size(layer, args.location,
                                                input_size, output_size)

            data_key = self._get_data_key(scheme, weight_size)
            self.weight.add_partition(
                part_index,
                data_key,
                size=(weight_size, weight_size),
                dtype=scheme.precision,
            )

        # validate that shared tensors and schemes are correct
        self._validate_input_transforms()

        # select kernel based on transform schemes
        self.kernel = self._infer_kernel() if kernel is None else kernel

    def process_weights_after_loading(self):
        """
        Finalize the loaded weights and precompute the per-partition scale.

        Must run before `forward`, which reads `self.scales`.
        """
        # required by torch.compile
        # this converts every partition in one call, so it only needs to be
        # invoked once rather than once per partition
        self.weight.process_weights_after_loading()

        for part_id, partition in self.weight.partitions.items():
            # precompute scale as a runtime multiply, not division
            # do not fold into weight in order to utilize FWHT
            self.scales[part_id] = 1 / math.sqrt(partition.data.size(0))

        # FUTURE: avoid runtime transpose by processing weights
        # prior to apply

    def forward(self, value: Tensor, part_id: int = 0) -> Tensor:
        """
        Apply the transform registered for `part_id` to `value`.

        :param value: tensor to transform
        :param part_id: index of the partition whose transform is applied
        :return: `value` unchanged if no transform is registered for
            `part_id`; otherwise the kernel output cast back to the dtype of
            `value` and multiplied by the precomputed scale
        """
        if part_id not in self.weight.partitions:
            return value

        weight = self.weight.partitions[part_id]
        weight = weight if self.transforms[
            part_id].args.inverse else weight.T  # linear := x(W.T)
        scale = self.scales[part_id]
        return self.kernel(self, value.to(weight.dtype), weight, None).to(
            value.dtype) * scale

    def _get_data_key(self, scheme: TransformScheme,
                      weight_size: int) -> Hashable:
        """Key under which the shared tensor for this transform is stored."""
        return (id(scheme), weight_size)

    def _get_weight_size(self, layer: torch.nn.Module,
                         location: TransformLocation, input_size: int,
                         output_size: int) -> int:
        """
        Pick the side length of the square transform weight.

        For `LinearBase` layers, input transforms use `input_size` and output
        transforms use `output_size`. For `VocabParallelEmbedding` layers the
        two are swapped.

        :raises ValueError: for any other layer type or transform location
        """
        if isinstance(layer, LinearBase):
            if location == TransformLocation.INPUT:
                return input_size

            elif location == TransformLocation.OUTPUT:
                return output_size

        elif isinstance(layer, VocabParallelEmbedding):
            if location == TransformLocation.INPUT:
                return output_size

            elif location == TransformLocation.OUTPUT:
                return input_size

        raise ValueError(
            "Cannot determine transform weight size for layer type "
            f"{type(layer).__name__} with transform location {location}")

    def _validate_input_transforms(self):
        """
        Check that, when the first transform is an input transform, every
        registered partition is backed by the same tensor data.

        :raises ValueError: if any partition points at different tensor data
        """
        assert len(self.transforms) > 0
        location = next(iter(self.transforms.values())).args.location

        if location == TransformLocation.INPUT:
            # compare against the first registered partition rather than
            # index 0, which is not guaranteed to have a transform
            partitions = list(self.weight.partitions.values())
            first_ptr = partitions[0].data.data_ptr()
            for partition in partitions:
                if partition.data.data_ptr() != first_ptr:
                    raise ValueError(
                        "Input transforms on the partitions of a layer must "
                        "all share the same weight tensor")

    def _infer_kernel(self) -> Callable:
        """Select the function used to apply the transform."""
        # TODO (@ksayers): use fwht, hadacore
        return dispatch_unquantized_gemm()
|
||||
@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import ( # noqa: E501
|
||||
CompressedTensorsLinearTransformMethod)
|
||||
|
||||
|
||||
# Because qutlass fuses hadamard with quantization, it cannot automatically be
|
||||
# composed with kernels in the way CompressedTensorsLinearTransformMethod does.
|
||||
# Therefore, a separate scheme must be created for each quantized dtype
|
||||
class QutlassLinearMethodNvFP4(CompressedTensorsLinearTransformMethod):
    """
    Placeholder for the fused hadamard quant linear method.

    `apply` is not implemented yet and always raises.
    """

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        :param layer: layer the method is applied to
        :param x: input activations
        :param bias: optional bias
        :raises NotImplementedError: always
        """
        # fused hadamard quant linear method
        raise NotImplementedError(
            f"{self.__class__.__name__}.apply (fused hadamard quant linear "
            "method) is not implemented yet")
|
||||
@ -0,0 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import NamedTuple
|
||||
|
||||
from compressed_tensors.transform import TransformArgs, TransformScheme
|
||||
|
||||
__all__ = ["TransformTuple"]
|
||||
|
||||
|
||||
class TransformTuple(NamedTuple):
    """Bundle of a transform scheme, its name, and its arguments."""

    # name of the scheme
    scheme_name: str
    # the scheme itself
    scheme: TransformScheme
    # arguments of the transform
    args: TransformArgs
|
||||
@ -124,7 +124,7 @@ class MoeWNA16Config(QuantizationConfig):
|
||||
awq_min_capability = AWQConfig.get_min_capability()
|
||||
|
||||
gptq_compatible = quant_method == "gptq" and \
|
||||
not desc_act and num_bits in [4, 8]
|
||||
not desc_act and num_bits in [4, 8]
|
||||
awq_compatible = quant_method == "awq" and num_bits == 4 and \
|
||||
device_capability >= awq_min_capability
|
||||
|
||||
@ -175,11 +175,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
quant_config: MoeWNA16Config,
|
||||
moe: FusedMoEConfig,
|
||||
):
|
||||
def __init__(self, quant_config: MoeWNA16Config,
|
||||
moe: "FusedMoEConfig") -> None:
|
||||
super().__init__(moe)
|
||||
self.quant_config = quant_config
|
||||
|
||||
@ -187,6 +184,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
hidden_size: int, intermediate_size_per_partition: int,
|
||||
params_dtype: torch.dtype, **extra_weight_attrs):
|
||||
|
||||
self.moe = layer
|
||||
layer.quant_config = self.quant_config
|
||||
bit8_pack_factor = self.quant_config.bit8_pack_factor
|
||||
group_size = self.quant_config.group_size
|
||||
@ -308,7 +306,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
logical_replica_count: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
assert self.fused_experts is None
|
||||
|
||||
if enable_eplb:
|
||||
raise NotImplementedError(
|
||||
"EPLB not supported for `MoeWNA16Method` yet.")
|
||||
@ -404,12 +401,14 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
|
||||
def moe_wna16_weight_loader(param: torch.nn.Parameter,
|
||||
loaded_weight: torch.Tensor,
|
||||
weight_name: str, shard_id: str,
|
||||
expert_id: int):
|
||||
weight_name: str,
|
||||
shard_id: str,
|
||||
expert_id: int,
|
||||
return_success: bool = False):
|
||||
if "g_idx" in weight_name:
|
||||
return
|
||||
return False if return_success else None
|
||||
if not layer.quant_config.has_zp and "qzeros" in weight_name:
|
||||
return
|
||||
return False if return_success else None
|
||||
|
||||
device = get_tp_group().device
|
||||
tp_rank = get_tensor_model_parallel_rank()
|
||||
@ -455,11 +454,18 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
param.data[expert_id, :shard_size // 2] = tensor
|
||||
else:
|
||||
param.data[expert_id, shard_size // 2:] = tensor
|
||||
return True if return_success else None
|
||||
elif "w2_qzeros" in weight_name:
|
||||
param.data[expert_id] = loaded_weight.view(
|
||||
loaded_weight.size(0), layer.tp_size, -1)[:, tp_rank]
|
||||
return True if return_success else None
|
||||
else:
|
||||
weight_loader(param, loaded_weight, weight_name, shard_id,
|
||||
expert_id)
|
||||
# Delegate to the original loader, passing return_success
|
||||
return weight_loader(param,
|
||||
loaded_weight,
|
||||
weight_name,
|
||||
shard_id,
|
||||
expert_id,
|
||||
return_success=return_success)
|
||||
|
||||
return moe_wna16_weight_loader
|
||||
|
||||
@ -623,8 +623,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
|
||||
if should_use_flashinfer_mxfp4():
|
||||
from flashinfer import mxfp8_quantize, trtllm_fp4_block_scale_moe
|
||||
assert not self.moe.use_ep, (
|
||||
"EP is not supported for flashinfer mxfp4 moe backend yet.")
|
||||
if _should_use_flashinfer_mxfp4_bf16():
|
||||
assert x.dtype == torch.bfloat16
|
||||
x_quant = x
|
||||
@ -650,12 +648,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
None, # output1_scale_scalar
|
||||
None, # output1_scale_gate_scalar
|
||||
None, # output2_scale_scalar
|
||||
self.num_experts,
|
||||
global_num_experts,
|
||||
top_k,
|
||||
None, # n_group
|
||||
None, # topk_group
|
||||
self.intermediate_size, # padded to multiple of 256
|
||||
0, # local_expert_offset
|
||||
layer.ep_rank * layer.local_num_experts, # local_expert_offset
|
||||
self.num_experts, # local num experts
|
||||
None,
|
||||
self._get_tile_tokens_dim(x, top_k),
|
||||
|
||||
@ -27,12 +27,15 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper,
|
||||
maybe_prefix)
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import SupportsQuant
|
||||
from ..layers.pooler import ClassifierPooler, DispatchPooler, Pooler
|
||||
from .bert import BertPooler
|
||||
from .interfaces import SupportsCrossEncoding, SupportsQuant
|
||||
from .interfaces_base import default_pooling_type
|
||||
|
||||
|
||||
@ -406,9 +409,14 @@ class BertWithRopeEncoder(nn.Module):
|
||||
class BertWithRope(nn.Module, SupportsQuant):
|
||||
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
def __init__(self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
add_pooling_layer: bool = False):
|
||||
super().__init__()
|
||||
self.vllm_config = vllm_config
|
||||
self.add_pooling_layer = add_pooling_layer
|
||||
self.config = vllm_config.model_config.hf_config
|
||||
self.embeddings = BertWithRopeEmbedding(self.config)
|
||||
self.encoder = BertWithRopeEncoder(
|
||||
@ -416,6 +424,7 @@ class BertWithRope(nn.Module, SupportsQuant):
|
||||
bias=getattr(self.config, "bias", True),
|
||||
rotary_kwargs=self.config.rotary_kwargs,
|
||||
prefix=f"{prefix}.encoder")
|
||||
self.pooler = BertPooler(self.config) if add_pooling_layer else None
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -448,7 +457,7 @@ class BertWithRope(nn.Module, SupportsQuant):
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "pooler" in name:
|
||||
if not self.add_pooling_layer and "pooler" in name:
|
||||
continue
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
@ -508,8 +517,8 @@ class GteNewModel(BertWithRope):
|
||||
"attention.o_proj": "attn.out_proj",
|
||||
})
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs)
|
||||
|
||||
# GteNewModel only gate_up_proj does not have bias.
|
||||
# Hack method learned from vllm/model_executor/models/glm.py
|
||||
@ -614,3 +623,65 @@ class JinaRobertaModel(BertWithRope):
|
||||
torch.Tensor]]) -> set[str]:
|
||||
weights = self.jina_merge_lora_weights(weights)
|
||||
return super().load_weights(weights)
|
||||
|
||||
|
||||
@default_pooling_type("CLS")
class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
    """
    `GteNewModel` backbone (built with its pooling layer) followed by a
    linear classifier, exposed through "encode", "classify" and "score"
    poolers.
    """
    is_pooling_model = True

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        model_config = vllm_config.model_config
        hf_config = model_config.hf_config
        quant_config = vllm_config.quant_config

        # backbone; the pooling layer is required by the classifier poolers
        self.new = GteNewModel(vllm_config=vllm_config,
                               prefix=prefix,
                               add_pooling_layer=True)

        # classification head mapping hidden_size -> num_labels
        self.classifier = RowParallelLinear(
            hf_config.hidden_size,
            hf_config.num_labels,
            input_is_parallel=False,
            bias=True,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "classifier"),
            return_bias=False,
        )

        pooler_config = model_config.pooler_config
        assert pooler_config is not None

        def build_classifier_pooler(act_fn):
            # "classify" and "score" differ only in their activation function
            return ClassifierPooler(
                pooling=self.new.pooler,
                classifier=self.classifier,
                act_fn=act_fn,
            )

        self.pooler = DispatchPooler({
            "encode":
            Pooler.for_encode(pooler_config),
            "classify":
            build_classifier_pooler(
                ClassifierPooler.act_fn_for_seq_cls(model_config)),
            "score":
            build_classifier_pooler(
                ClassifierPooler.act_fn_for_cross_encoder(model_config)),
        })

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        """Load `weights` with `AutoWeightsLoader` and return its result."""
        return AutoWeightsLoader(self).load_weights(weights)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the backbone and return its output unchanged."""
        backbone_kwargs = dict(
            input_ids=input_ids,
            positions=positions,
            inputs_embeds=inputs_embeds,
            intermediate_tensors=intermediate_tensors,
        )
        return self.new(**backbone_kwargs)
|
||||
|
||||
@ -406,6 +406,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
|
||||
MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"GteModel": SnowflakeGteNewModelConfig,
|
||||
"GteNewModel": GteNewModelConfig,
|
||||
"GteNewForSequenceClassification": GteNewModelConfig,
|
||||
"NomicBertModel": NomicBertModelConfig,
|
||||
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
|
||||
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
|
||||
|
||||
@ -11,7 +11,8 @@ from transformers import GptOssConfig
|
||||
from vllm.attention import Attention, AttentionType
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import CacheConfig, VllmConfig
|
||||
from vllm.distributed import (get_ep_group, get_tensor_model_parallel_rank,
|
||||
from vllm.distributed import (get_ep_group, get_pp_group,
|
||||
get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size)
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
@ -27,7 +28,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import cdiv
|
||||
|
||||
from .interfaces import SupportsPP
|
||||
from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index,
|
||||
is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
maybe_prefix)
|
||||
|
||||
|
||||
@ -75,8 +79,6 @@ class OAIAttention(nn.Module):
|
||||
dtype=torch.bfloat16,
|
||||
requires_grad=False))
|
||||
|
||||
self.norm = RMSNorm(config.hidden_size, eps=1e-5)
|
||||
|
||||
self.q_size = self.num_attention_heads * self.head_dim // tp_size
|
||||
self.kv_size = self.num_key_value_heads * self.head_dim // tp_size
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -119,16 +121,13 @@ class OAIAttention(nn.Module):
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor,
|
||||
positions: torch.Tensor) -> torch.Tensor:
|
||||
t = self.norm(hidden_states)
|
||||
|
||||
qkv, _ = self.qkv(t)
|
||||
qkv, _ = self.qkv(hidden_states)
|
||||
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
||||
q, k = self.rotary_emb(positions, q, k)
|
||||
v = v.contiguous()
|
||||
attn_output = self.attn(q, k, v)
|
||||
output, _ = self.o_proj(attn_output)
|
||||
|
||||
return output + hidden_states
|
||||
return output
|
||||
|
||||
|
||||
class MLPBlock(torch.nn.Module):
|
||||
@ -145,7 +144,6 @@ class MLPBlock(torch.nn.Module):
|
||||
self.num_experts = config.num_local_experts
|
||||
self.experts_per_token = config.num_experts_per_tok
|
||||
self.world_size = dist.get_world_size() if dist.is_initialized() else 1
|
||||
self.norm = RMSNorm(config.hidden_size, eps=1e-5)
|
||||
self.router = torch.nn.Linear(config.hidden_size,
|
||||
config.num_local_experts,
|
||||
dtype=torch.bfloat16)
|
||||
@ -163,10 +161,9 @@ class MLPBlock(torch.nn.Module):
|
||||
activation="swigluoai")
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
t = self.norm(x)
|
||||
g = self.router(t)
|
||||
t = self.experts(hidden_states=t, router_logits=g)
|
||||
return x + t
|
||||
g = self.router(x)
|
||||
x = self.experts(hidden_states=x, router_logits=g)
|
||||
return x
|
||||
|
||||
|
||||
class TransformerBlock(torch.nn.Module):
|
||||
@ -187,12 +184,28 @@ class TransformerBlock(torch.nn.Module):
|
||||
self.layer_idx,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.mlp")
|
||||
self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
|
||||
self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor,
|
||||
positions: torch.Tensor) -> torch.Tensor:
|
||||
attn_output = self.attn(hidden_states, positions)
|
||||
output = self.mlp(attn_output)
|
||||
return output
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
else:
|
||||
hidden_states, residual = self.input_layernorm(
|
||||
hidden_states, residual)
|
||||
hidden_states = self.attn(hidden_states, positions)
|
||||
# Fully Connected
|
||||
hidden_states, residual = self.post_attention_layernorm(
|
||||
hidden_states, residual)
|
||||
output = self.mlp(hidden_states)
|
||||
return output, residual
|
||||
|
||||
|
||||
@support_torch_compile
|
||||
@ -214,22 +227,52 @@ class GptOssModel(nn.Module):
|
||||
self.config.vocab_size,
|
||||
self.config.hidden_size,
|
||||
)
|
||||
self.layers = torch.nn.ModuleList([
|
||||
TransformerBlock(
|
||||
self.start_layer, self.end_layer, self.layers = make_layers(
|
||||
self.config.num_hidden_layers,
|
||||
lambda prefix: TransformerBlock(
|
||||
self.config,
|
||||
cache_config=self.cache_config,
|
||||
quant_config=self.quant_config,
|
||||
prefix=maybe_prefix(prefix, f"block.{layer_idx}"),
|
||||
) for layer_idx in range(self.config.num_hidden_layers)
|
||||
])
|
||||
prefix=prefix,
|
||||
),
|
||||
prefix=f"{prefix}.layers",
|
||||
)
|
||||
self.norm = RMSNorm(self.config.hidden_size, eps=1e-5)
|
||||
self.make_empty_intermediate_tensors = (
|
||||
make_empty_intermediate_tensors_factory(
|
||||
["hidden_states", "residual"], self.config.hidden_size))
|
||||
|
||||
def forward(self, input_ids: torch.Tensor,
|
||||
positions: torch.Tensor) -> torch.Tensor:
|
||||
x = self.embedding(input_ids)
|
||||
for layer in self.layers:
|
||||
x = layer(x, positions)
|
||||
x = self.norm(x)
|
||||
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
|
||||
return self.embedding(input_ids)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
if get_pp_group().is_first_rank:
|
||||
if inputs_embeds is not None:
|
||||
x = inputs_embeds
|
||||
else:
|
||||
x = self.get_input_embeddings(input_ids)
|
||||
|
||||
residual = None
|
||||
else:
|
||||
assert intermediate_tensors is not None
|
||||
x = intermediate_tensors["hidden_states"]
|
||||
residual = intermediate_tensors["residual"]
|
||||
|
||||
for i in range(self.start_layer, self.end_layer):
|
||||
layer = self.layers[i]
|
||||
x, residual = layer(x, positions, residual)
|
||||
if not get_pp_group().is_last_rank:
|
||||
return IntermediateTensors({
|
||||
"hidden_states": x,
|
||||
"residual": residual
|
||||
})
|
||||
x, _ = self.norm(x, residual)
|
||||
return x
|
||||
|
||||
def _load_weights_mxfp4(
|
||||
@ -264,6 +307,10 @@ class GptOssModel(nn.Module):
|
||||
intermediate_size)
|
||||
|
||||
for name, weight in weights:
|
||||
# Skip layers on other devices.
|
||||
if is_pp_missing_parameter(name, self):
|
||||
continue
|
||||
|
||||
# FIXME(woosuk): Remove this after testing.
|
||||
weight = weight.cuda()
|
||||
|
||||
@ -445,6 +492,10 @@ class GptOssModel(nn.Module):
|
||||
intermediate_size)
|
||||
|
||||
for name, weight in weights:
|
||||
# Skip layers on other devices.
|
||||
if is_pp_missing_parameter(name, self):
|
||||
continue
|
||||
|
||||
if ".w13_weight" in name:
|
||||
# Handle MLP gate and up projection weights
|
||||
# Extract gate and up projection parts
|
||||
@ -562,18 +613,15 @@ class GptOssModel(nn.Module):
|
||||
weights, stacked_params_mapping)
|
||||
|
||||
|
||||
class GptOssForCausalLM(nn.Module):
|
||||
class GptOssForCausalLM(nn.Module, SupportsPP):
|
||||
packed_modules_mapping = {"qkv": ["q_proj", "k_proj", "v_proj"]}
|
||||
|
||||
hf_to_vllm_mapper = WeightsMapper(
|
||||
orig_to_new_substr={
|
||||
".self_attn.": ".attn.",
|
||||
".post_attention_layernorm.": ".mlp.norm.",
|
||||
},
|
||||
orig_to_new_suffix={
|
||||
".embed_tokens.weight": ".embedding.weight",
|
||||
".input_layernorm.weight": ".attn.norm.weight",
|
||||
".post_attention_layernorm.weight": ".mlp.norm.weight",
|
||||
|
||||
# MoE MXFP4 weights
|
||||
".gate_up_proj_blocks": ".w13_weight",
|
||||
@ -609,15 +657,19 @@ class GptOssForCausalLM(nn.Module):
|
||||
self.config.hidden_size,
|
||||
)
|
||||
self.logits_processor = LogitsProcessor(self.config.vocab_size)
|
||||
self.make_empty_intermediate_tensors = (
|
||||
self.model.make_empty_intermediate_tensors)
|
||||
|
||||
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
|
||||
return self.model.get_input_embeddings(input_ids)
|
||||
|
||||
def forward(self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
assert intermediate_tensors is None
|
||||
assert inputs_embeds is None
|
||||
return self.model(input_ids, positions)
|
||||
return self.model(input_ids, positions, intermediate_tensors,
|
||||
inputs_embeds)
|
||||
|
||||
def compute_logits(self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> torch.Tensor:
|
||||
|
||||
@ -191,12 +191,14 @@ _EMBEDDING_MODELS = {
|
||||
|
||||
_CROSS_ENCODER_MODELS = {
|
||||
"BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
|
||||
"GteNewForSequenceClassification": ("bert_with_rope",
|
||||
"GteNewForSequenceClassification"),
|
||||
"ModernBertForSequenceClassification": ("modernbert",
|
||||
"ModernBertForSequenceClassification"),
|
||||
"RobertaForSequenceClassification": ("roberta",
|
||||
"RobertaForSequenceClassification"),
|
||||
"XLMRobertaForSequenceClassification": ("roberta",
|
||||
"RobertaForSequenceClassification"),
|
||||
"ModernBertForSequenceClassification": ("modernbert",
|
||||
"ModernBertForSequenceClassification"),
|
||||
# [Auto-converted (see adapters.py)]
|
||||
"JinaVLForRanking": ("jina_vl", "JinaVLForSequenceClassification"), # noqa: E501,
|
||||
}
|
||||
|
||||
@ -507,10 +507,10 @@ def merge_multimodal_embeddings(
|
||||
This updates ``inputs_embeds`` in place.
|
||||
"""
|
||||
if isinstance(placeholder_token_id, list):
|
||||
placeholder_token_id = torch.tensor(placeholder_token_id,
|
||||
pin_memory=True).to(
|
||||
device=input_ids.device,
|
||||
non_blocking=True)
|
||||
placeholder_token_id = torch.tensor(
|
||||
placeholder_token_id,
|
||||
pin_memory=is_pin_memory_available()).to(device=input_ids.device,
|
||||
non_blocking=True)
|
||||
return _merge_multimodal_embeddings(
|
||||
inputs_embeds,
|
||||
torch.isin(input_ids, placeholder_token_id),
|
||||
|
||||
@ -1,13 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Hashable
|
||||
from fractions import Fraction
|
||||
from typing import Callable, Optional, Union
|
||||
from weakref import WeakValueDictionary
|
||||
|
||||
import torch
|
||||
from torch.nn import Parameter
|
||||
|
||||
from vllm.distributed import get_tensor_model_parallel_rank
|
||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.utils import _make_synced_weight_loader
|
||||
|
||||
@ -27,7 +30,7 @@ class BasevLLMParameter(Parameter):
|
||||
into the parameter when the provided weight loader is called.
|
||||
"""
|
||||
|
||||
def __new__(cls, data: torch.Tensor, **kwargs):
|
||||
def __new__(cls, data: Optional[torch.Tensor], **kwargs):
|
||||
|
||||
return super().__new__(cls, data=data, requires_grad=False)
|
||||
|
||||
@ -81,6 +84,17 @@ class BasevLLMParameter(Parameter):
|
||||
def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
|
||||
self._assert_and_load(loaded_weight)
|
||||
|
||||
def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
|
||||
if isinstance(shard_id, int):
|
||||
return shard_id
|
||||
|
||||
# if not int, assume shard_id for qkv
|
||||
# map to int and return
|
||||
qkv_idxs = {"q": 0, "k": 1, "v": 2}
|
||||
assert isinstance(shard_id, str)
|
||||
assert shard_id in qkv_idxs
|
||||
return qkv_idxs[shard_id]
|
||||
|
||||
|
||||
class _ColumnvLLMParameter(BasevLLMParameter):
|
||||
"""
|
||||
@ -113,6 +127,7 @@ class _ColumnvLLMParameter(BasevLLMParameter):
|
||||
|
||||
shard_offset = kwargs.get("shard_offset")
|
||||
shard_size = kwargs.get("shard_size")
|
||||
# TODO: move these to PackedColumnParameter and PackedvLLMParameter
|
||||
if isinstance(
|
||||
self,
|
||||
(PackedColumnParameter,
|
||||
@ -137,6 +152,7 @@ class _ColumnvLLMParameter(BasevLLMParameter):
|
||||
shard_id = kwargs.get("shard_id")
|
||||
num_heads = kwargs.get("num_heads")
|
||||
|
||||
# TODO: move these to PackedColumnParameter and PackedvLLMParameter
|
||||
if isinstance(
|
||||
self,
|
||||
(PackedColumnParameter,
|
||||
@ -224,19 +240,8 @@ class PerTensorScaleParameter(BasevLLMParameter):
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.qkv_idxs = {"q": 0, "k": 1, "v": 2}
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
|
||||
if isinstance(shard_id, int):
|
||||
return shard_id
|
||||
|
||||
# if not int, assume shard_id for qkv
|
||||
# map to int and return
|
||||
assert isinstance(shard_id, str)
|
||||
assert shard_id in self.qkv_idxs
|
||||
return self.qkv_idxs[shard_id]
|
||||
|
||||
# For row parallel layers, no sharding needed
|
||||
# load weight into parameter as is
|
||||
def load_row_parallel_weight(self, *args, **kwargs):
|
||||
@ -373,6 +378,141 @@ class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
|
||||
pass
|
||||
|
||||
|
||||
class SharedWeightParameter(BasevLLMParameter):
|
||||
"""
|
||||
Parameter for weights with many shared tensors across a model
|
||||
|
||||
For example, when applying transforms to the "gate" and "up" partitions of
|
||||
`MergedColumnParallelLinear`, the transform weights must stay separate
|
||||
tensors in order to allow for tensor memory sharing between layers.
|
||||
"""
|
||||
# global registry for sharing tensors based on passed `data_key`
|
||||
# this dict holds weaksrefs to avoid memory leak after model cleanup
|
||||
tensors_registry: WeakValueDictionary = WeakValueDictionary()
|
||||
|
||||
# local container for strong references to shared tensors
|
||||
# this set compensates for the fact that torch.nn.Parameter
|
||||
# and Parameter subclasses do not hold reliable references to tensors
|
||||
local_tensors: set[torch.Tensor]
|
||||
|
||||
# dictionary mapping partition indices to associated parameters
|
||||
partitions: dict[int, Union[ModelWeightParameter, Parameter]]
|
||||
|
||||
def __new__(cls, **kwargs):
|
||||
return super().__new__(cls, data=None, **kwargs)
|
||||
|
||||
def __init__(self, input_dim: int = 1, output_dim: int = 0, **kwargs):
|
||||
weight_loader: Callable = kwargs.get(
|
||||
"weight_loader") # type: ignore[assignment]
|
||||
super().__init__(data=None, weight_loader=weight_loader)
|
||||
|
||||
self.local_tensors = set()
|
||||
self.partitions = {}
|
||||
self.kwargs = {
|
||||
"input_dim": input_dim,
|
||||
"output_dim": output_dim,
|
||||
"weight_loader": self._fake_weight_loader
|
||||
}
|
||||
|
||||
self.tp_rank = get_tensor_model_parallel_rank()
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
|
||||
if self.tp_size > 1:
|
||||
raise NotImplementedError(f"{self.__class__.__name__} does not "
|
||||
"currently support tensor parallelism")
|
||||
|
||||
def add_partition(self, index: int, data_key: Hashable, *args, **kwargs):
|
||||
"""
|
||||
Add a partition to the weight parameter. Partitions whose `data_key`
|
||||
is the same will share tensor data
|
||||
|
||||
:param index: index of partition to add
|
||||
:param data_key: hashable key used to key shared tensors
|
||||
:param *args: arguments for `torch.empty`
|
||||
:param **kwargs: keyword arguments for `torch.empty`
|
||||
"""
|
||||
# load (shared) tensor using `data_key`
|
||||
if data_key not in self.tensors_registry:
|
||||
data = torch.empty(*args, **kwargs)
|
||||
self.tensors_registry[data_key] = data
|
||||
else:
|
||||
data = self.tensors_registry[data_key]
|
||||
|
||||
# create associated model parameter
|
||||
self.partitions[index] = ModelWeightParameter(
|
||||
data=data, **self.kwargs) # type: ignore[arg-type]
|
||||
|
||||
# hold local reference, since ModelWeightParameter does not
|
||||
# see https://github.com/pytorch/pytorch/issues/75932
|
||||
self.local_tensors.add(data)
|
||||
|
||||
def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
|
||||
assert len(self.partitions) == 1 and 0 in self.partitions
|
||||
partition = self.partitions[0]
|
||||
|
||||
ModelWeightParameter.load_column_parallel_weight(
|
||||
partition, loaded_weight)
|
||||
|
||||
def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
|
||||
assert len(self.partitions) == 1 and 0 in self.partitions
|
||||
partition = self.partitions[0]
|
||||
|
||||
ModelWeightParameter.load_row_parallel_weight(partition, loaded_weight)
|
||||
|
||||
def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
|
||||
partition_id = kwargs.pop("shard_id")
|
||||
partition_id = self._shard_id_as_int(partition_id)
|
||||
partition = self.partitions[partition_id]
|
||||
|
||||
input_dim = self.kwargs.get("input_dim")
|
||||
shard_size = partition.data.size(input_dim) // self.tp_size
|
||||
shard_offset = self.tp_rank * shard_size
|
||||
|
||||
ModelWeightParameter.load_merged_column_weight(
|
||||
partition,
|
||||
loaded_weight,
|
||||
shard_offset=shard_offset,
|
||||
shard_size=shard_size)
|
||||
|
||||
def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
    """Load one shard of a QKV weight into its partition.

    The target partition is selected by the `shard_id` keyword argument
    (removed from `kwargs` and converted with `_shard_id_as_int`). The
    shard size is the partition's extent along `input_dim` divided by
    `tp_size`, and the offset is `tp_rank` shards into that dimension.

    :param loaded_weight: weight tensor read from the checkpoint
    :param **kwargs: must contain `shard_id`; `num_heads` is forwarded
        when present (otherwise `None` is passed)
    """
    target = self.partitions[self._shard_id_as_int(kwargs.pop("shard_id"))]

    dim = self.kwargs.get("input_dim")
    per_rank = target.data.size(dim) // self.tp_size
    rank_start = self.tp_rank * per_rank

    # The delegate is always told it is loading the "q" shard, whatever
    # partition was selected above ("fake first partition").
    ModelWeightParameter.load_qkv_weight(
        target,
        loaded_weight,
        shard_offset=rank_start,
        shard_size=per_rank,
        shard_id="q",
        num_heads=kwargs.get("num_heads"),
    )
|
||||
|
||||
def process_weights_after_loading(self):
    """Replace every partition with a plain `torch.nn.Parameter`.

    Each entry of `self.partitions` is re-wrapped around the same `.data`
    tensor with `requires_grad=False`; the set of keys is unchanged.
    """
    # Snapshot the keys so the mapping is not iterated while being assigned.
    for index in tuple(self.partitions):
        frozen = torch.nn.Parameter(data=self.partitions[index].data,
                                    requires_grad=False)
        self.partitions[index] = frozen
|
||||
|
||||
@property
def data(self):
    """Always raises: this parameter has no single `data` tensor.

    :raises ValueError: on every access; the message points callers to
        `get_partition` instead
    """
    message = ("Accessing `data` of a "
               "`PartitionedModelWeightParameter` is not allowed. "
               "Instead, use `get_partition` to get the weight of "
               "the particular partition you want to access")
    raise ValueError(message)
|
||||
|
||||
def _fake_weight_loader(self, param: BasevLLMParameter,
                        loaded_weight: torch.Tensor,
                        loaded_weight_shard_id: Optional[Union[str, int]]):
    """Weight-loader stand-in that rejects every call.

    All arguments are ignored.

    :raises ValueError: always; the message names this object's class and
        tells callers to use its own loading methods
    """
    owner = self.__class__.__name__
    raise ValueError("When loading partition weights of "
                     f"{owner}, use methods provided by "
                     f"{owner}, not partition loader")
|
||||
|
||||
|
||||
def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
|
||||
output_dim: int, **kwargs) -> BasevLLMParameter:
|
||||
"""
|
||||
@ -456,4 +596,4 @@ def _adjust_shard_indexes_for_packing(shard_size, shard_offset, packed_factor,
|
||||
shard_offset=shard_offset,
|
||||
bitblas_tile_size=bitblas_tile_size)
|
||||
|
||||
return shard_size, shard_offset
|
||||
return shard_size, shard_offset
|
||||
|
||||
@ -11,7 +11,8 @@ logger = init_logger(__name__)
|
||||
|
||||
class CudagraphDispatcher:
|
||||
"""
|
||||
Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
|
||||
Runtime cudagraph dispatcher to dispatch keys for multiple set of
|
||||
cudagraphs.
|
||||
|
||||
The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
|
||||
for FULL cudagraph runtime mode. The keys are initialized depending on
|
||||
@ -21,7 +22,7 @@ class CudagraphDispatcher:
|
||||
|
||||
At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
|
||||
PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
|
||||
based on the input key. After dispatching (commuicate via forward context),
|
||||
based on the input key. After dispatching (communicate via forward context),
|
||||
the cudagraph wrappers will trust the dispatch key to do either capturing
|
||||
or replaying (if mode matched), or pass through to the underlying runnable
|
||||
without cudagraph (if mode no match or mode is NONE).
|
||||
|
||||
@ -365,9 +365,14 @@ def generate_uniform_probs(
|
||||
A tensor of shape `(num_tokens, )` containing uniform
|
||||
random values in the range [0, 1).
|
||||
"""
|
||||
# NOTE(woosuk): We deliberately use float64 instead of float32 here
|
||||
# because when using float32, there's a non-negligible chance that
|
||||
# uniform_prob is sampled to be exact 0.0 as reported in
|
||||
# https://github.com/pytorch/pytorch/issues/16706. Using float64
|
||||
# mitigates the issue.
|
||||
uniform_probs = torch.rand(
|
||||
(num_tokens, ),
|
||||
dtype=torch.float32,
|
||||
dtype=torch.float64,
|
||||
device=device,
|
||||
)
|
||||
start_idx = 0
|
||||
|
||||
@ -110,7 +110,7 @@ class BlockTable:
|
||||
self.block_table_cpu.fill_(0)
|
||||
|
||||
def get_device_tensor(self) -> torch.Tensor:
|
||||
"""Ruturns the device tensor of the block table."""
|
||||
"""Returns the device tensor of the block table."""
|
||||
return self.block_table
|
||||
|
||||
def get_cpu_tensor(self) -> torch.Tensor:
|
||||
|
||||
@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
|
||||
Args:
|
||||
scheduler_output: The scheduler output.
|
||||
"""
|
||||
# Attention free models have zero kv_cache_goups, however models
|
||||
# Attention free models have zero kv_cache_groups, however models
|
||||
# like Mamba are also attention free but use the kv_cache for
|
||||
# keeping its internal state. This is why we check the number
|
||||
# of kv_cache groups instead of solely checking
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user