mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 14:35:27 +08:00
[Kernel] Split Marlin MoE kernels into multiple files (#8661)
Co-authored-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
parent
cc4325b66a
commit
a928ded995
@ -316,6 +316,11 @@ set(VLLM_MOE_EXT_SRC
|
||||
|
||||
# Marlin MoE sources are appended to VLLM_MOE_EXT_SRC only when VLLM_GPU_LANG
# is exactly "CUDA". The kernel dispatchers are split into one .cu/.h pair per
# weight type (ku4b8, ku8b128), all sharing marlin_moe_kernel.h.
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
|
||||
"csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
|
||||
"csrc/moe/marlin_moe_ops.cu")
|
||||
endif()
|
||||
|
||||
|
||||
1425
csrc/moe/marlin_kernels/marlin_moe_kernel.h
Normal file
1425
csrc/moe/marlin_kernels/marlin_moe_kernel.h
Normal file
File diff suppressed because it is too large
Load Diff
29
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
Normal file
29
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
Normal file
@ -0,0 +1,29 @@
|
||||
#include "marlin_moe_kernel_ku4b8.h"
|
||||
|
||||
namespace marlin_moe {
|
||||
|
||||
// Dispatcher for the vllm::kU4B8 weight type: walks the GPTQ_CALL_IF_MOE
// configurations listed in the body.
|
||||
// Returns false when the trailing `else` is reached (no configuration
// applied) and true otherwise. Returning bool lets callers chain the per-type
// dispatchers as a sequence of if / else-if checks.
//
// None of the parameters is referenced directly in this body; they are
// presumably consumed by name inside the GPTQ_CALL_IF_MOE expansion, so they
// must not be renamed without checking the macro. NOTE(review): the macro is
// not defined in this file (presumably in marlin_moe_kernel.h) — confirm.
|
||||
bool call_marlin_moe_kernel_ku4b8(
|
||||
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
|
||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
||||
int cfg_max_m_blocks) {
|
||||
// Deliberately empty, never-taken first branch. The macro invocations below
// are followed by a bare `else`, so each one presumably expands to
// `else if (...) { ... }` arms that need a leading `if` to attach to.
if (false) {
|
||||
}
|
||||
// The numeric arguments are presumably (thread_n_blocks, thread_k_blocks,
// num_threads) for each supported configuration — TODO confirm against the
// macro definition.
GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
|
||||
GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
|
||||
GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
|
||||
GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
|
||||
// Fallthrough of the whole chain: nothing above applied.
else {
|
||||
return false;
|
||||
}
|
||||
// Reached only when the trailing `else` was not taken.
return true;
|
||||
}
|
||||
|
||||
} // namespace marlin_moe
|
||||
20
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
Normal file
20
csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
Normal file
@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
#include "marlin_moe_kernel.h"
|
||||
|
||||
namespace marlin_moe {
|
||||
|
||||
// Declaration of the vllm::kU4B8 dispatcher (defined in
// marlin_moe_kernel_ku4b8.cu).
|
||||
// Returns true when one of its configurations applied and false otherwise, so
// callers can chain the per-type dispatchers as a sequence of if / else-if
// checks.
|
||||
bool call_marlin_moe_kernel_ku4b8(
|
||||
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
|
||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
||||
int cfg_max_m_blocks);
|
||||
|
||||
} // namespace marlin_moe
|
||||
29
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
Normal file
29
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
Normal file
@ -0,0 +1,29 @@
|
||||
#include "marlin_moe_kernel_ku8b128.h"
|
||||
|
||||
namespace marlin_moe {
|
||||
|
||||
// Dispatcher for the vllm::kU8B128 weight type: walks the GPTQ_CALL_IF_MOE
// configurations listed in the body.
|
||||
// Returns false when the trailing `else` is reached (no configuration
// applied) and true otherwise. Returning bool lets callers chain the per-type
// dispatchers as a sequence of if / else-if checks.
//
// None of the parameters is referenced directly in this body; they are
// presumably consumed by name inside the GPTQ_CALL_IF_MOE expansion, so they
// must not be renamed without checking the macro. NOTE(review): the macro is
// not defined in this file (presumably in marlin_moe_kernel.h) — confirm.
|
||||
bool call_marlin_moe_kernel_ku8b128(
|
||||
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
|
||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
||||
int cfg_max_m_blocks) {
|
||||
// Deliberately empty, never-taken first branch. The macro invocations below
// are followed by a bare `else`, so each one presumably expands to
// `else if (...) { ... }` arms that need a leading `if` to attach to.
if (false) {
|
||||
}
|
||||
// The numeric arguments are presumably (thread_n_blocks, thread_k_blocks,
// num_threads) for each supported configuration — TODO confirm against the
// macro definition.
GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
|
||||
GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
|
||||
GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
|
||||
GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
|
||||
// Fallthrough of the whole chain: nothing above applied.
else {
|
||||
return false;
|
||||
}
|
||||
// Reached only when the trailing `else` was not taken.
return true;
|
||||
}
|
||||
|
||||
} // namespace marlin_moe
|
||||
18
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
Normal file
18
csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
Normal file
@ -0,0 +1,18 @@
|
||||
#pragma once
|
||||
|
||||
#include "marlin_moe_kernel.h"
|
||||
|
||||
namespace marlin_moe {
|
||||
|
||||
// Declaration of the vllm::kU8B128 dispatcher (defined in
// marlin_moe_kernel_ku8b128.cu).
// Returns true when one of its configurations applied and false otherwise, so
// callers can chain the per-type dispatchers as a sequence of if / else-if
// checks.
bool call_marlin_moe_kernel_ku8b128(
|
||||
vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
|
||||
bool has_act_order, int group_blocks, int num_threads, int blocks,
|
||||
int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
|
||||
const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
|
||||
const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
|
||||
int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
|
||||
int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
|
||||
bool replicate_input, bool apply_weights, int m_block, int max_par,
|
||||
int cfg_max_m_blocks);
|
||||
|
||||
}  // namespace marlin_moe
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user