vllm/csrc/cpu/dnnl_helper.h
Li, Jiang 7be5d113d8
[CPU] Refactor CPU W8A8 scaled_mm (#23071)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-21 09:34:24 +08:00

170 lines
4.2 KiB
C++

#ifndef DNNL_HELPER_H
#define DNNL_HELPER_H
#include <optional>
#include <cassert>
#include "oneapi/dnnl/dnnl.hpp"
namespace c10 {
struct BFloat16;
struct Half;
} // namespace c10
namespace dnnl {
namespace impl {
struct memory_storage_t;
struct matmul_pd_t;
struct matmul_desc_t;
} // namespace impl
} // namespace dnnl
struct dnnl_memory_desc;
template <typename KT, typename VT>
class DNNLPrimitiveCache;
template <typename T>
struct DNNLType {
static constexpr dnnl::memory::data_type type =
dnnl::memory::data_type::undef;
};
template <>
struct DNNLType<int8_t> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
};
template <>
struct DNNLType<int32_t> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
};
template <>
struct DNNLType<float> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
};
template <>
struct DNNLType<c10::BFloat16> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};
template <>
struct DNNLType<c10::Half> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
};
template <typename T>
constexpr inline dnnl::memory::data_type get_dnnl_type() {
return DNNLType<std::decay_t<T>>::type;
}
class DNNLMatMulPrimitiveHandler {
public:
virtual ~DNNLMatMulPrimitiveHandler() = default;
protected:
struct Args {
dnnl_dim_t b_n_size;
dnnl_dim_t b_n_stride;
dnnl_dim_t b_k_size;
dnnl_dim_t b_k_stride;
void* b_ptr;
dnnl::memory::data_type c_type;
size_t primitive_cache_size;
};
protected:
DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
void prepack_weight(void* original_b_ptr,
dnnl::memory::desc b_target_mem_desc);
void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>
get_runtime_memory_ptr(size_t index);
protected:
const dnnl_dim_t b_n_size_;
const dnnl_dim_t b_n_stride_;
const dnnl_dim_t b_k_size_;
const dnnl_dim_t b_k_stride_;
dnnl::memory::data_type b_type_;
dnnl::memory::data_type c_type_;
std::unordered_map<int, dnnl::memory> memory_cache_;
std::vector<std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>>
runtime_memory_ptrs_;
dnnl::memory::desc b_target_mem_desc_;
int64_t primitive_cache_size_;
};
class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
public:
enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL };
struct Args : public DNNLMatMulPrimitiveHandler::Args {
bool use_a_zero_point;
QuantizationStrategy a_quantization_strategy;
QuantizationStrategy b_quantization_strategy;
float* b_scales_ptr;
};
struct ClassMatmulCacheKey {
dnnl_dim_t b_n_size;
dnnl_dim_t b_k_size;
QuantizationStrategy a_qs;
QuantizationStrategy b_qs;
bool use_azp;
dnnl::memory::data_type c_type;
friend bool operator==(const ClassMatmulCacheKey& l,
const ClassMatmulCacheKey& r);
};
struct MSizeCacheKey {
dnnl_dim_t a_m_size;
bool use_bias;
dnnl::memory::data_type bias_type;
friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
};
using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
using ClassMatmulCache =
DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
struct ExecArgs : public MSizeCacheKey {
const int8_t* a_ptr;
const float* a_scales_ptr;
const int32_t* a_zero_points_ptr;
const void* bias_ptr;
void* c_ptr;
};
public:
W8A8MatMulPrimitiveHandler(const Args& args);
QuantizationStrategy get_input_scale_strategy() const { return a_qs_; }
bool get_input_use_zero_point() const { return use_azp_; }
void execute(ExecArgs& args);
private:
dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
bool first_time);
void init_runtime_memory_cache(const Args& args);
dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
private:
const bool use_azp_;
const QuantizationStrategy a_qs_;
const QuantizationStrategy b_qs_;
std::shared_ptr<MSizeCache> m_size_cache_;
};
#endif