Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-11 03:54:59 +08:00)
[CI] change spell checker from codespell to typos (#18711)

Signed-off-by: Andy Xie <andy.xning@gmail.com>

parent 42f52cc95b
commit 2f1c19b245
.gitignore (vendored): 2 changes

@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
 
-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
@@ -20,12 +20,10 @@ repos:
 args: [--output-format, github, --fix]
 - id: ruff-format
 files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/codespell-project/codespell
-rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+rev: v1.32.0
 hooks:
-- id: codespell
-additional_dependencies: ['tomli']
-args: ['--toml', 'pyproject.toml']
+- id: typos
 - repo: https://github.com/PyCQA/isort
 rev: 6.0.1
 hooks:
@@ -137,7 +137,7 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }
 
 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
 const int size) {
 T max = max_data[0];
 for (int i = 1; i < size; ++i) {
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
 
 if (partition_num == 1) continue;
 
-reducePartitonSoftmax(
+reducePartitionSoftmax(
 max_logits + seq_idx * num_heads * max_num_partitions +
 head_idx * max_num_partitions,
 exp_sums + seq_idx * num_heads * max_num_partitions +
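The renamed reducePartitionSoftmax helper merges the per-partition softmax statistics produced by the split-KV (v2) attention path. As a rough orientation only, the usual log-sum-exp style merge it corresponds to can be sketched in NumPy as below; this illustrates the general technique and is not a line-for-line port of the C++ kernel.

import numpy as np

def reduce_partition_softmax(max_data: np.ndarray, sum_data: np.ndarray):
    """Merge per-partition softmax statistics (illustrative sketch only).

    max_data[i] is partition i's running max of the logits and sum_data[i]
    is its sum of exp(logit - max_data[i]). The merged result is the global
    max plus the partial sums rescaled onto that common max.
    """
    global_max = max_data.max()
    rescaled = sum_data * np.exp(max_data - global_max)
    return global_max, rescaled.sum()

# Example: two partitions with different local maxima.
g_max, g_sum = reduce_partition_softmax(np.array([1.0, 3.0]), np.array([2.5, 1.2]))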
@@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
 explicit FP16Vec16(const void* ptr)
 : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit FP16Vec16(bool, void* ptr)
 : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
 
@@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 explicit BF16Vec16(const void* ptr)
 : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit BF16Vec16(bool, void* ptr)
 : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
 
@@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 // normal load
 explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit FP32Vec16(bool, void* ptr)
 : reg((__m512)_mm512_stream_load_si512(ptr)) {}
 
@@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
 // normal load
 explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
 
 void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
 _mm512_mask_storeu_epi8(ptr, mask, reg);
 }
 
-// non-temproal save
+// non-temporal save
 void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
 #endif
@@ -12,7 +12,7 @@ void moe_permute(
 const torch::Tensor& input, // [n_token, hidden]
 const torch::Tensor& topk_weights, //[n_token, topk]
 torch::Tensor& topk_ids, // [n_token, topk]
-const torch::Tensor& token_expert_indicies, // [n_token, topk]
+const torch::Tensor& token_expert_indices, // [n_token, topk]
 const std::optional<torch::Tensor>& expert_map, // [n_expert]
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@ void moe_permute(
 "expert_first_token_offset must be int64");
 TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
 "topk_ids must be int32");
-TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
-"token_expert_indicies must be int32");
+TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
+"token_expert_indices must be int32");
 TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
 "src_row_id2dst_row_id_map must be int32");
 TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
 "expert_first_token_offset shape != n_local_expert+1")
 TORCH_CHECK(
-src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
-"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
+src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
 auto n_token = input.sizes()[0];
 auto n_hidden = input.sizes()[1];
 auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
 expert_map_ptr, n_expert, stream);
 }
 // expert sort topk expert id and scan expert id get expert_first_token_offset
-sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
+sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
 get_ptr<int>(permuted_experts_id),
 get_ptr<int>(dst_row_id2src_row_id_map),
 get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor,
 
 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 torch::Tensor& topk_ids,
-const torch::Tensor& token_expert_indicies,
+const torch::Tensor& token_expert_indices,
 const std::optional<torch::Tensor>& expert_map,
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 
 void moe_unpermute(const torch::Tensor& input,
 const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-const torch::Tensor& token_expert_indicies,
+const torch::Tensor& token_expert_indices,
 const std::optional<torch::Tensor>& expert_map,
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
 
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
 topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
-gating_output, nullptr, topk_weights, topk_indicies, \
+gating_output, nullptr, topk_weights, topk_indices, \
 token_expert_indices, num_tokens, topk, 0, num_experts, \
 stream);
 
@@ -433,7 +433,7 @@ template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
 const float* gating_output,
 float* topk_weights,
-IndType* topk_indicies,
+IndType* topk_indices,
 int* token_expert_indices,
 float* softmax_workspace,
 const int num_tokens,
@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
 moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
 gating_output, nullptr, softmax_workspace, num_experts);
 moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
-softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
+softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
 num_experts, topk, 0, num_experts);
 }
 }
@@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
 m.def(
 "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
-"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
+"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
 "int n_local_expert,"
 "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
 "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
 static constexpr int A_CPY_VEC =
 decltype(max_common_vector(tCsA, tCrA_load)){};
 
-static constexpr int COVERSION_WIDTH =
+static constexpr int CONVERSION_WIDTH =
 std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));
 
 auto load_A_to_registers = [&](int read_stage) {
@@ -1026,7 +1026,7 @@ struct MacheteCollectiveMma {
 // PIPELINED MAIN LOOP
 //
 
-auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
+auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
 int read_stage) {
 load_extra_info_to_registers(partitioned_extra_info,
 copy_partitions_extra_info, k_block,
@@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];
 
@@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];
 
@@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
 uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 
 m += CuCount * _WvPrGrp * YTILE;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];
 
@@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 //----------------------------------------------------
 uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 m += CuCount * _WvPrGrp * YTILE;
 kBase = 0;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
 uint32_t const m = 1; // Set M to 1 for compression
 uint32_t const n = a.size(1);
 
-// Note: For correctess, the compressed format must be invariant in:
+// Note: For correctness, the compressed format must be invariant in:
 // - M, the flattened number of tokens
 // - Whether output dtype is fp16 or bf16
 // - CUTLASS epilogues
@@ -137,10 +137,6 @@ exclude = [
 'vllm/attention/ops/.*\.py$'
 ]
 
-[tool.codespell]
-ignore-words-list = "dout, te, indicies, subtile, ElementE"
-skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
-
 [tool.isort]
 skip_glob = [
 ".buildkite/*",
@@ -223,7 +223,7 @@ def test_async_tp_pass_correctness(
 "VLLM_USE_V1": "1",
 }
 
-aysnc_tp_args = [
+async_tp_args = [
 *common_args,
 "--tensor-parallel-size",
 str(tp_size),
@@ -242,7 +242,7 @@ def test_async_tp_pass_correctness(
 ]
 
 compare_two_settings(model_id,
-aysnc_tp_args,
+async_tp_args,
 tp_args,
 async_tp_env,
 tp_env,
@@ -437,7 +437,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 "enable_prefix_caching": True,
 }])
 @pytest.mark.parametrize("seed", [1])
-def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
+def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
 test_llm_generator):
 """Verify block manager v2 with auto prefix caching could works normal
 even when eviction started.
@@ -33,7 +33,7 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
-def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
+def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
 batch_size, seed, backend, monkeypatch):
 """
 The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
 backend, monkeypatch):
 """
-This is similar to test_sliding_window_retrival, however, it doesn't
+This is similar to test_sliding_window_retrieval, however, it doesn't
 compare against the v1 block manager since v1 doesn't support
 chunked prefill with sliding window.
 
@@ -594,8 +594,8 @@ def test_decode_schedule_preempted():
 # should be preempted. 1 will also be preempted.
 budget = create_token_budget()
 output = scheduler._schedule_running(budget, curr_loras)
-remainig_running = scheduler.running
-assert len(remainig_running) == 0
+remaining_running = scheduler.running
+assert len(remaining_running) == 0
 assert len(output.decode_seq_groups) == 1
 assert len(output.prefill_seq_groups) == 0
 assert output.decode_seq_groups[0].seq_group.request_id == "0"
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 
 # Define models, templates, and their corresponding expected outputs
-MODEL_TEMPLATE_GENERATON_OUTPUT = [
+MODEL_TEMPLATE_GENERATION_OUTPUT = [
 ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
 
 @pytest.mark.parametrize(
 "model,template,add_generation_prompt,continue_final_message,expected_output",
-MODEL_TEMPLATE_GENERATON_OUTPUT)
+MODEL_TEMPLATE_GENERATION_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
 continue_final_message, expected_output):
 model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
@@ -72,8 +72,8 @@ def test_copy_blocks(
 # destination blocks.
 assert 2 * num_mappings <= num_blocks
 src_blocks = random.sample(range(num_blocks), num_mappings)
-remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
-dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
+remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
+dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
 block_mapping: list[tuple[int, int]] = []
 for i in range(num_mappings):
 src = src_blocks[i]
@@ -189,12 +189,12 @@ def test_reshape_and_cache(
 
 # Run the reference implementation.
 reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
-block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-block_indicies_lst = block_indicies.cpu().tolist()
+block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+block_indices_lst = block_indices.cpu().tolist()
 block_offsets = slot_mapping % block_size
 block_offsets_lst = block_offsets.cpu().tolist()
 for i in range(num_tokens):
-block_idx = block_indicies_lst[i]
+block_idx = block_indices_lst[i]
 block_offset = block_offsets_lst[i]
 cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
 cloned_value_cache[block_idx, :, :, block_offset] = value[i]
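The renamed block_indices variables implement the standard paged-KV-cache addressing used by the reference check above: a flat slot index is split into a block index and an offset inside that block. A minimal standalone sketch of that index math, with made-up example values, is:

import torch

block_size = 16
slot_mapping = torch.tensor([0, 17, 35])  # one flat cache slot per token (example values)

block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_offsets = slot_mapping % block_size

# slot 17 -> block 1, offset 1; slot 35 -> block 2, offset 3
print(list(zip(block_indices.tolist(), block_offsets.tolist())))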
@@ -322,12 +322,12 @@ def test_reshape_and_cache_flash(
 kv_dtype=kv_cache_dtype)
 
 # Run the reference implementation.
-block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-block_indicies_lst = block_indicies.cpu().tolist()
+block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+block_indices_lst = block_indices.cpu().tolist()
 block_offsets = slot_mapping % block_size
 block_offsets_lst = block_offsets.cpu().tolist()
 for i in range(num_tokens):
-block_idx = block_indicies_lst[i]
+block_idx = block_indices_lst[i]
 block_offset = block_offsets_lst[i]
 if kv_cache_layout == "NHD":
 cloned_key_cache[block_idx, block_offset, :, :] = key[i]
@@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0"
 MAX_DEC_SEQ_LENS = [128]
 MAX_ENC_SEQ_LENS = [128]
 
-# Narrow teest-cases for unsupported-scenario
+# Narrow test-cases for unsupported-scenario
 # tests
 HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]
 
@@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot,
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
 @pytest.mark.parametrize("use_key", [True, False])
-@pytest.mark.parametrize("head_stride_is_contingous", [True, False])
+@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
 def test_rotary_embedding_opcheck(dist_init, device, max_position,
 is_neox_style, rotary_dim, head_size,
-seq_len, use_key, head_stride_is_contingous):
+seq_len, use_key, head_stride_is_contiguous):
 batch_size = 1
 base = 10000
 num_heads = 7
@@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
 positions = torch.randint(0,
 max_position, (batch_size, seq_len),
 device=device)
-head_stride = head_size + (64 if head_stride_is_contingous else 0)
+head_stride = head_size + (64 if head_stride_is_contiguous else 0)
 
 query = torch.randn(batch_size,
 seq_len,
@@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
 
 # if we have a contiguous head stride, test the alternate
 # [..., num_heads * head_dim] shape/layout
-if head_stride_is_contingous:
+if head_stride_is_contiguous:
 rotary_embedding_opcheck(
 rot, positions, query.flatten(start_dim=-2),
 key.flatten(start_dim=-2) if use_key else None)
@@ -107,7 +107,7 @@ def generate_random_inputs(batch_size,
 return A, dt, X, B, C
 
 
-def generate_continous_batched_examples(example_lens_by_batch,
+def generate_continuous_batched_examples(example_lens_by_batch,
 num_examples,
 full_length,
 last_taken,
@@ -269,10 +269,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
 exhausted: dict = {} # map: eg -> boolean indicating example is exhausted
 
 states = None
-for Y_min, cu_seqlens, seq_idx, (A, dt, X, B,
-C) in generate_continous_batched_examples(
-cases, num_examples, seqlen,
-last_taken, exhausted, n_heads,
+for Y_min, cu_seqlens, seq_idx, (
+A, dt, X, B, C) in generate_continuous_batched_examples(
+cases, num_examples, seqlen, last_taken, exhausted, n_heads,
 d_head, itype):
 
 chunk_indices, chunk_offsets = \
@@ -118,7 +118,7 @@ def run_test(
 # default to enforce_eager=True if enforce_eager
 # is left unspecified. However, the
 # VllmRunner test fixture (which wraps around the LLM class) defaults to
-# enforce_eager=False (a behavior which a number of already-exisitng
+# enforce_eager=False (a behavior which a number of already-existing
 # decoder-only unit tests expect), so when testing an encoder/decoder
 # model we must explicitly specify enforce_eager=True in the VllmRunner
 # constructor.
@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
 size=(batch_size, 1),
 dtype=torch.int64)
 # The target probaility distribution is a temperature zero distribution
-# with zero entroy. Since our draft token ids don't match the probability
+# with zero entropy. Since our draft token ids don't match the probability
 # 1.0 tokens in the target distribution we will reject all of them and
 # fallback to the greedy sampling for selecting 1 token for each sequence.
 # Verify the same.
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 
 With those tests, we can say at least, EAGLE would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 
 import pytest
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 
 With those tests, we can say at least, Medusa would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 
 import pytest
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 
 With those tests, we can say at least, mtp would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 
 import pytest
@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under preemption
 * Test greedy equality under various ngram sizes / speculative sizes
 
-With those tests, we can say at least, ngram spec would not break the correctess
-for the target model outputs.
+With those tests, we can say at least, ngram spec would not break the
+correctness for the target model outputs.
 """
 
 import pytest
@@ -30,7 +30,7 @@ model_config = {
 ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
-def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
+def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed):
 """
 The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
 asks for value of one of them (which is outside the sliding window).
@@ -7,7 +7,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
 from .utils import create_request, create_scheduler, create_vllm_config
 
 
-def test_basic_inferface():
+def test_basic_interface():
 """Unit test for basic NixlConnector interface functionality."""
 
 vllm_config = create_vllm_config()
@@ -25,7 +25,7 @@ def test_basic_inferface():
 
 scheduler.add_request(request)
 
-# Remote Prefill, triggers NixlConnectorMetdata.
+# Remote Prefill, triggers NixlConnectorMetadata.
 scheduler_output = scheduler.schedule()
 kv_connector_metadata = scheduler_output.kv_connector_metadata
 assert kv_connector_metadata is not None
@@ -32,7 +32,7 @@ def test_prompt_logprobs_e2e():
 ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
 
 
-def test_promt_logprobs_e2e_server():
+def test_prompt_logprobs_e2e_server():
 with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
 url = f"{remote_server.url_for('v1')}/completions"
 
@@ -209,32 +209,32 @@ def test_multi_step_model_runner_input():
 received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
 tensor_dict, attn_backend=attn_backend))
 
-receieved_frozen_input = received_model_input.frozen_model_input
+received_frozen_input = received_model_input.frozen_model_input
 
 # Check that received copy has correct values.
 assert isinstance(received_model_input, StatefulModelInput)
-assert receieved_frozen_input.input_tokens is not None
-assert (receieved_frozen_input.input_tokens ==
+assert received_frozen_input.input_tokens is not None
+assert (received_frozen_input.input_tokens ==
 frozen_model_input.input_tokens).all()
-assert receieved_frozen_input.input_positions is not None
-assert (receieved_frozen_input.input_positions ==
+assert received_frozen_input.input_positions is not None
+assert (received_frozen_input.input_positions ==
 frozen_model_input.input_positions).all()
-assert receieved_frozen_input.multi_modal_kwargs is None
+assert received_frozen_input.multi_modal_kwargs is None
 assert (frozen_model_input.multi_modal_kwargs ==
 frozen_model_input.multi_modal_kwargs)
-assert receieved_frozen_input.lora_requests is None
-assert (receieved_frozen_input.lora_requests ==
+assert received_frozen_input.lora_requests is None
+assert (received_frozen_input.lora_requests ==
 frozen_model_input.lora_requests)
-assert receieved_frozen_input.lora_mapping is None
+assert received_frozen_input.lora_mapping is None
 assert (
-receieved_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
+received_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
 for field in dataclasses.fields(AttentionMetadata):
-assert getattr(receieved_frozen_input.attn_metadata, field.name,
+assert getattr(received_frozen_input.attn_metadata, field.name,
 None) == getattr(attn_metadata, field.name, None)
 # For sampling metadata, only selected_token_indices is copied.
-assert (receieved_frozen_input.sampling_metadata.selected_token_indices ==
+assert (received_frozen_input.sampling_metadata.selected_token_indices ==
 sampling_metadata.selected_token_indices)
-assert receieved_frozen_input.sampling_metadata.seq_groups is None
+assert received_frozen_input.sampling_metadata.seq_groups is None
 
 # check non frozen fields
 assert received_model_input.is_last_step == model_input.is_last_step
@@ -116,7 +116,7 @@ def ReadTargets(log, show_all):
 # If ninja.exe is rudely halted then the .ninja_log file may be
 # corrupt. Silently continue.
 continue
-start, end, _, name, cmdhash = parts # Ignore restat.
+start, end, _, name, cmdhash = parts # Ignore restart.
 # Convert from integral milliseconds to float seconds.
 start = int(start) / 1000.0
 end = int(end) / 1000.0
typos.toml: new file, 179 lines

[files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true

[default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []

[default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"

[default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"

[type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"

[type.py.extend-words]

[type.cpp]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.cpp.extend-identifiers]
countr_one = "countr_one"

[type.cpp.extend-words]

[type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.rust.extend-identifiers]
flate2 = "flate2"

[type.rust.extend-words]
ser = "ser"

[type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.lock.extend-identifiers]

[type.lock.extend-words]

[type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.jl.extend-identifiers]

[type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"

[type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.go.extend-identifiers]
flate = "flate"

[type.go.extend-words]

[type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.css.extend-identifiers]
nd = "nd"

[type.css.extend-words]

[type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.man.extend-identifiers]
Nd = "Nd"

[type.man.extend-words]

[type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.cert.extend-identifiers]

[type.cert.extend-words]

[type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.sh.extend-identifiers]
stap = "stap"
ot = "ot"

[type.sh.extend-words]

[type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.vimscript.extend-identifiers]
windo = "windo"

[type.vimscript.extend-words]
@@ -1550,10 +1550,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
 
 
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-token_expert_indicies: torch.Tensor,
+token_expert_indices: torch.Tensor,
 gating_output: torch.Tensor) -> None:
-torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
-token_expert_indicies, gating_output)
+torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices,
+gating_output)
 
 
 def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor],
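For orientation, the routing step wrapped by topk_softmax above can be approximated in plain PyTorch. The sketch below shows only the general top-k gating-softmax technique (softmax over the expert axis, then top-k); it is not the fused _moe_C kernel, and the shapes and dtypes are illustrative assumptions.

import torch

def topk_softmax_reference(gating_output: torch.Tensor, topk: int):
    """Reference top-k gating softmax: softmax over experts, then top-k.

    gating_output: assumed [num_tokens, num_experts] float logits.
    Returns (topk_weights, topk_ids), each of shape [num_tokens, topk].
    """
    probs = torch.softmax(gating_output, dim=-1)
    topk_weights, topk_ids = torch.topk(probs, k=topk, dim=-1)
    return topk_weights, topk_ids.to(torch.int32)

# Example: route 4 tokens over 8 experts with top-2 selection.
weights, ids = topk_softmax_reference(torch.randn(4, 8), topk=2)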
@@ -373,7 +373,7 @@ class CommonAttentionState(AttentionState):
 f"Expected attn_backend name to be either 'XFORMERS'," \
 f"'ROCM_FLASH', or 'FLASH_ATTN', but " \
 f"got '{self.runner.attn_backend.get_name()}'"
-self._add_additonal_input_buffers_for_enc_dec_model(
+self._add_additional_input_buffers_for_enc_dec_model(
 attn_metadata=attn_metadata, input_buffers=input_buffers)
 return input_buffers
 
@@ -427,7 +427,7 @@ class CommonAttentionState(AttentionState):
 attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
 attn_metadata.num_encoder_tokens = 0
 
-def _add_additonal_input_buffers_for_enc_dec_model(
+def _add_additional_input_buffers_for_enc_dec_model(
 self, attn_metadata, input_buffers: Dict[str, Any]):
 """
 Saves additional input buffers specific to the encoder-decoder model
@@ -40,7 +40,7 @@ class Internlm2ToolParser(ToolParser):
 request.skip_special_tokens = False
 return request
 
-def get_argments(self, obj):
+def get_arguments(self, obj):
 if "parameters" in obj:
 return obj.get("parameters")
 elif "arguments" in obj:
@@ -119,9 +119,9 @@ class Internlm2ToolParser(ToolParser):
 # now we know we're on the same tool call and we're streaming
 # arguments
 else:
-prev_arguments = self.get_argments(
+prev_arguments = self.get_arguments(
 self.prev_tool_call_arr[self.current_tool_id])
-cur_arguments = self.get_argments(tool_call_arr)
+cur_arguments = self.get_arguments(tool_call_arr)
 
 # not arguments generated
 if not cur_arguments and not prev_arguments:
@@ -170,7 +170,7 @@ class Internlm2ToolParser(ToolParser):
 # check to see if the name is defined and has been sent. if so,
 # stream the name - otherwise keep waiting
 # finish by setting old and returning None as base case
-tool_call_arr["arguments"] = self.get_argments(tool_call_arr)
+tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
 self.prev_tool_call_arr = [tool_call_arr]
 return delta
 except Exception:
@@ -1202,7 +1202,7 @@ class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
 multiple LoRA adapters with a specialized kernel.
 
 Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding
-which can handle multi lora adapters in a specialied kernel.
+which can handle multi lora adapters in a specialized kernel.
 """
 
 def __init__(self, base_layer: RotaryEmbedding) -> None:
@@ -68,11 +68,11 @@ def convert_mapping(
 LoRA indices.
 sampler_indices: Tensor of shape [batch_size] mapping requests to
 LoRA indices for sampler. For generation, this will be the
-same as base_indicies. For prefill, this will map requests
+same as base_indices. For prefill, this will map requests
 to LoRA indices.
 sampler_indices_padded: Tensor of shape [batch_size] mapping
 requests to LoRA indices for sampler with padding.
-Same as sampler_indicies, but -1 is replaced with
+Same as sampler_indices, but -1 is replaced with
 max_loras.
 embeddings_indices: Tensor of shape [2, batch_size] mapping
 requests to embedding indices. First row is for embeddings
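To make the corrected docstring concrete, the padding relationship it describes (sampler_indices_padded equals sampler_indices with every -1 replaced by max_loras) can be written out as a small sketch; the tensor values below are illustrative and not taken from vLLM internals.

import torch

max_loras = 4
# -1 marks requests without an active LoRA adapter (example values).
sampler_indices = torch.tensor([0, 2, -1, 1, -1])

sampler_indices_padded = torch.where(
    sampler_indices == -1,
    torch.full_like(sampler_indices, max_loras),
    sampler_indices,
)
# -> tensor([0, 2, 4, 1, 4])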
@@ -319,7 +319,7 @@ class MambaMixer2(CustomOp):
 n_groups == 1, # if there was only one group
 )
 intermediate_settings = (intermediate_size, 0, False)
-head_setings = (self.num_heads, 0, False)
+head_settings = (self.num_heads, 0, False)
 
 # - the weight already has a "weight_loader" attribute
 # which set_weight_attrs will raise if we do not
@@ -372,7 +372,7 @@ class MambaMixer2(CustomOp):
 intermediate_settings,
 group_shard_settings,
 group_shard_settings,
-head_setings, # for dt
+head_settings, # for dt
 ],
 self.tp_size,
 tp_rank,
@@ -516,7 +516,7 @@ def _chunk_state_varlen_kernel(
 offs_n[None, :] * stride_chunk_states_dstate)
 else:
 
-# - this seems repetitve, buts its to help the compiler
+# - this seems repetitive, buts its to help the compiler
 if start_idx < pid_c * chunk_size:
 past_states_ptrs = chunk_states_ptr + (
 offs_m[:, None] * stride_chunk_states_hdim +
@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
 quantized tensor along with the scaling factor used for quantization.
 
 Args:
-x: The input tenosr with ndim >= 2.
+x: The input tensor with ndim >= 2.
 group_size: The group size used for quantization.
 eps: The minimum to avoid dividing zero.
 dtype: The dype of output tensor. Note that only `torch.int8`
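As a companion to the corrected docstring, per-token-group symmetric int8 quantization can be sketched in plain PyTorch. The function below follows the Args listed above (input with ndim >= 2, group_size along the last dimension, eps to avoid dividing by zero) but is only an illustrative reference, not the kernel vLLM actually ships.

import torch

def per_token_group_quant_int8_ref(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    """Illustrative per-token-group int8 quantization (reference sketch)."""
    assert x.ndim >= 2 and x.shape[-1] % group_size == 0
    grouped = x.reshape(*x.shape[:-1], -1, group_size).float()
    # One scale per group: map the group's max absolute value onto the int8 range.
    scales = grouped.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / 127.0
    q = torch.clamp(torch.round(grouped / scales), -128, 127).to(torch.int8)
    return q.reshape_as(x), scales.squeeze(-1)

# Example: quantize a [4, 128] activation with groups of 64 along the last dim.
x_q, x_s = per_token_group_quant_int8_ref(torch.randn(4, 128), group_size=64)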
@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 self.target_modules.append(
 name.replace(rep_name, sub_name))
 # Add original module name even if the module has stacked map,
-# in case model has a mixture of disk-merged and disk-splitted
+# in case model has a mixture of disk-merged and disk-split
 # weights with same last name.
 self.target_modules.append(name)
 
@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
|
|||||||
self.num_heads = (self.total_num_heads //
|
self.num_heads = (self.total_num_heads //
|
||||||
tensor_model_parallel_world_size)
|
tensor_model_parallel_world_size)
|
||||||
self.head_dim = hidden_size // self.total_num_heads
|
self.head_dim = hidden_size // self.total_num_heads
|
||||||
self.postion_embedding = position_embedding
|
self.position_embedding = position_embedding
|
||||||
self.rope_theta = rope_theta
|
self.rope_theta = rope_theta
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
|
||||||
@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
quant_config=quant_config,
)
# Create the alibi slopes and slice them.
- if self.postion_embedding == "ALIBI":
+ if self.position_embedding == "ALIBI":
tp_rank = get_tensor_model_parallel_rank()
head_start = tp_rank * self.num_heads
head_end = (tp_rank + 1) * self.num_heads
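For context, the ALIBI branch above slices the per-head slopes down to the heads owned by the current tensor-parallel rank. A rough sketch of that slicing, using a hypothetical helper and the simplified power-of-two-head-count slope formula (both are assumptions, not the module itself):

import torch

def slice_alibi_slopes(total_num_heads: int, num_heads: int, tp_rank: int) -> torch.Tensor:
    # Build slopes for all heads, then keep this rank's contiguous slice.
    slopes = torch.tensor(
        [2 ** (-8.0 * (i + 1) / total_num_heads) for i in range(total_num_heads)])
    head_start = tp_rank * num_heads
    head_end = (tp_rank + 1) * num_heads
    return slopes[head_start:head_end]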
@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.W_pack(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1)
- if self.postion_embedding != "ALIBI":
+ if self.position_embedding != "ALIBI":
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.image_newline = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std)
# This is a typo in original implementation
- self.view_seperator = nn.Parameter(
+ self.view_separator = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std)
else:
raise ValueError(
@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if self.global_view_pos == "head":
global_local_features = torch.cat([
global_features,
- self.view_seperator[None, :],
+ self.view_separator[None, :],
local_features,
])
else:
global_local_features = torch.cat([
local_features,
- self.view_seperator[None, :],
+ self.view_separator[None, :],
global_features,
])

@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
return logits

def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
- # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
+ # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# due to missing lm_head weights and its config being that of a
# Llama model. Here's a compatible version with the same weights:
# https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm
@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
kwargs["has_images"] = True
# NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
# This is a HACK. Fix this.
- start_idices = (positions == 0).cpu().nonzero()
+ start_indices = (positions == 0).cpu().nonzero()
- num_seqs = len(start_idices)
+ num_seqs = len(start_indices)
seq_lens = []
for i in range(num_seqs):
- start_idx = start_idices[i].item()
+ start_idx = start_indices[i].item()
if i < num_seqs - 1:
- end_idx = start_idices[i + 1].item()
+ end_idx = start_indices[i + 1].item()
else:
end_idx = len(input_ids)
seq_lens.append(end_idx - start_idx)
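The hack above recovers per-sequence lengths from a packed token buffer by treating every position id of 0 as the start of a new sequence. A compact equivalent, assuming a 1-D `positions` tensor:

import torch

positions = torch.tensor([0, 1, 2, 0, 1, 0, 1, 2, 3])  # three packed sequences
start_indices = (positions == 0).nonzero(as_tuple=False).flatten()
# Sequence lengths are the gaps between consecutive starts, with the tail
# closed off by the total number of tokens.
boundaries = torch.cat([start_indices, torch.tensor([positions.numel()])])
seq_lens = (boundaries[1:] - boundaries[:-1]).tolist()
print(seq_lens)  # [3, 2, 4]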
@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
renormalize: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
router_scores, router_indices = fast_topk(gating_output, topk, dim=-1)
- # psuedo-standard is that the router scores are floats
+ # pseudo-standard is that the router scores are floats
router_scores = torch.sigmoid(router_scores.float())
return (router_scores, router_indices.to(torch.int32))

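The routing above selects the top-k gating logits per token and then applies a sigmoid to the selected scores instead of a softmax over all experts. A small sketch with torch.topk standing in for fast_topk (an assumption):

import torch

gating_output = torch.randn(4, 16)  # [tokens, num_experts]
router_scores, router_indices = torch.topk(gating_output, k=1, dim=-1)
router_scores = torch.sigmoid(router_scores.float())  # scores kept as floats
router_indices = router_indices.to(torch.int32)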
@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
f"Tensor parallel size {self.tp_size} is greater than "
f"the number of experts {self.num_total_experts}.")
# Split experts equally between ranks
- self.expert_indicies = np.array_split(range(
+ self.expert_indices = np.array_split(range(self.num_total_experts),
- self.num_total_experts), self.tp_size)[self.rank].tolist()
+ self.tp_size)[self.rank].tolist()
- if not self.expert_indicies:
+ if not self.expert_indices:
raise ValueError(
f"Rank {self.rank} has no experts assigned to it.")

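The renamed list above holds the expert ids owned by each tensor-parallel rank; np.array_split distributes the experts as evenly as possible, with earlier ranks taking any remainder. For example:

import numpy as np

num_total_experts, tp_size = 8, 3
per_rank = [np.array_split(range(num_total_experts), tp_size)[r].tolist()
            for r in range(tp_size)]
print(per_rank)  # [[0, 1, 2], [3, 4, 5], [6, 7]]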
@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
config.hidden_size,
config.intermediate_size,
quant_config=quant_config)
- if idx in self.expert_indicies else None
+ if idx in self.expert_indices else None
for idx in range(self.num_total_experts)
])
self.gate = ReplicatedLinear(config.hidden_size,
@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

final_hidden_states = None
- for expert_idx in self.expert_indicies:
+ for expert_idx in self.expert_indices:
expert_layer = self.experts[expert_idx]
expert_mask = (selected_experts == expert_idx)
expert_weights = (routing_weights * expert_mask).sum(dim=-1,
@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
height, width = self.get_hf_processor().get_image_size()
hs = self.get_hf_config().visual_tokenizer_config.hidden_stride
- # NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code
+ # NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code
# https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
return ImageSize(width=width * hs * 9, height=height * hs * 9)

@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.num_q_per_kv = self.num_heads // self.num_key_value_heads
if self.tp_size > 1:
assert self.num_key_value_heads % self.tp_size == 0
- self.num_kv_heads_per_partion = max(
+ self.num_kv_heads_per_partition = max(
1, self.num_key_value_heads // self.tp_size)
self.num_heads_per_partition = self.num_heads // self.tp_size

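The renamed field above is the number of KV heads served by each tensor-parallel partition; the arithmetic is an even split with a floor of one. With example values (assumed, for illustration only):

num_heads, num_key_value_heads, tp_size = 32, 8, 4
assert num_key_value_heads % tp_size == 0
num_kv_heads_per_partition = max(1, num_key_value_heads // tp_size)  # 2
num_heads_per_partition = num_heads // tp_size                       # 8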
@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
bs_params = {
'max_seqlen': self.max_position_embeddings,
'num_heads': self.num_heads_per_partition,
- "num_kv_heads": self.num_kv_heads_per_partion,
+ "num_kv_heads": self.num_kv_heads_per_partition,
"block_size": self.sparse_block_size,
"local_blocks": self.local_blocks,
"vert_stride": self.vert_stride,
@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.attn = Attention(self.num_heads_per_partition,
self.head_dim,
self.scale,
- num_kv_heads=self.num_kv_heads_per_partion,
+ num_kv_heads=self.num_kv_heads_per_partition,
cache_config=cache_config,
quant_config=quant_config,
blocksparse_params=bs_params,
@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
# NOTE: this is required by RotaryEmbed, which indeed does not have to
# TODO: allow 3D QK for rotary forward
q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
- k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+ k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
- v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+ v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)

q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
(Multi-Head Attention),
1 = typical Multi-Head Attention,
1 < attn_group_sizes < attention_heads = Grouped-Query Attention
- attn_group_sizes = attenion_heads = Multi-Query Attention
+ attn_group_sizes = attention_heads = Multi-Query Attention
"""

def __init__(
@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
Attention
- attention_group_size = attenion_heads = Multi-Query Attention
+ attention_group_size = attention_heads = Multi-Query Attention
"""

def __init__(
@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
Attention
- attention_group_size = attenion_heads = Multi-Query Attention
+ attention_group_size = attention_heads = Multi-Query Attention
"""

extra_multi_layer_output_idxs: list[int]
@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):

grad_at_output = grad_at_output * multiplier

- grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1)
+ grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1)
- grad_at_scores_expaned.scatter_add_(
+ grad_at_scores_expanded.scatter_add_(
dim=-1,
index=selected_experts,
src=grad_at_output,
)

return (
- grad_at_scores_expaned,
+ grad_at_scores_expanded,
None,
None,
None,
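The backward pass above accumulates the incoming gradient back into the full expert dimension with scatter_add_. In isolation, the indexing pattern looks like this (shapes and example values are assumptions):

import torch

num_tokens, num_experts, top_k = 4, 8, 2
grad_at_scores_expanded = torch.zeros(num_tokens, num_experts)
selected_experts = torch.randint(0, num_experts, (num_tokens, top_k))
grad_at_output = torch.randn(num_tokens, top_k)
# Add each selected expert's gradient into its column of the full score tensor.
grad_at_scores_expanded.scatter_add_(dim=-1, index=selected_experts, src=grad_at_output)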
@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
Returns:
list[str]: List of item modalities in order of their positions in the
input sequence.
- list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from
+ list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise.
@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
"""

attributes = ["image_processor", "tokenizer"]
- valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"]
+ valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]

image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
num_decode_tokens=0,
slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps=
- None,  # FIXME(kzawora): mutli-modality will not work here
+ None,  # FIXME(kzawora): multi-modality will not work here
enable_kv_scales_calculation=False,
)
multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
assert fmi.input_tokens.shape[0] >= self.num_seqs
fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs]

- # Update frozen_model_input::input_positons.
+ # Update frozen_model_input::input_positions.
assert fmi.input_positions is not None
assert fmi.input_positions.shape[0] >= self.num_seqs
fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self.
@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
"""
batch_size, seq_len = token_ids.shape
# Calculate the positions to sample from.
- start_indicies = torch.arange(
+ start_indices = torch.arange(
batch_size, dtype=torch.int32, device=input_lens.device) * seq_len
- logits_indices = start_indicies + input_lens - 1
+ logits_indices = start_indices + input_lens - 1
attn_metadata = get_forward_context().attn_metadata

# FIXME(woosuk): This is a temporary hack to avoid using the existing
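The renamed start_indices above mark where each padded sequence begins in the flattened token buffer, and the sampled position is the last real token of each sequence. A worked example with assumed sizes:

import torch

batch_size, seq_len = 3, 8
input_lens = torch.tensor([5, 8, 2], dtype=torch.int32)
start_indices = torch.arange(batch_size, dtype=torch.int32) * seq_len
logits_indices = start_indices + input_lens - 1
print(logits_indices.tolist())  # [4, 15, 17]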
@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape
slot_mapping = attn_metadata.slot_mapping
slot_mapping = slot_mapping.flatten()
- head_indicies = torch.arange(0,
+ head_indices = torch.arange(0,
num_kv_heads,
device=slot_mapping.device,
dtype=slot_mapping.dtype)
- head_indicies *= block_size * num_blocks
+ head_indices *= block_size * num_blocks
slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(
-1, num_kv_heads)
- slot_mapping = slot_mapping + head_indicies.view(1, -1)
+ slot_mapping = slot_mapping + head_indices.view(1, -1)
slot_mapping = slot_mapping.flatten()
attn_metadata.slot_mapping = slot_mapping

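The renamed head_indices above offset the flat slot mapping so that each KV head writes into its own region of the cache. A shape-level sketch with small assumed sizes:

import torch

num_kv_heads, num_blocks, block_size = 2, 4, 16
slot_mapping = torch.tensor([0, 1, 2, 3])  # one slot per token
head_indices = torch.arange(num_kv_heads) * block_size * num_blocks  # [0, 64]
# Repeat each token's slot once per KV head, then shift by the head offsets.
slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(-1, num_kv_heads)
slot_mapping = (slot_mapping + head_indices.view(1, -1)).flatten()
print(slot_mapping.tolist())  # [0, 64, 1, 65, 2, 66, 3, 67]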