mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-09 07:29:07 +08:00
[Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
d459fae0a2
commit
53a5a0ce30
@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller;
|
|||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue>
|
template <typename, typename, typename> typename Epilogue>
|
||||||
struct sm100_fp8_config_default {
|
struct sm100_fp8_config_default {
|
||||||
|
// M in (128, inf)
|
||||||
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
||||||
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
||||||
@ -25,6 +26,34 @@ struct sm100_fp8_config_default {
|
|||||||
KernelSchedule, EpilogueSchedule>;
|
KernelSchedule, EpilogueSchedule>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_M128 {
|
||||||
|
// M in (64, 128]
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
||||||
|
using TileShape = Shape<_128, _128, _64>;
|
||||||
|
using ClusterShape = Shape<_2, _2, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename InType, typename OutType,
|
||||||
|
template <typename, typename, typename> typename Epilogue>
|
||||||
|
struct sm100_fp8_config_M64 {
|
||||||
|
// M in [1, 64]
|
||||||
|
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
|
||||||
|
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
|
||||||
|
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
|
||||||
|
using TileShape = Shape<_64, _64, _256>;
|
||||||
|
using ClusterShape = Shape<_1, _8, _1>;
|
||||||
|
using Cutlass3xGemm =
|
||||||
|
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
|
||||||
|
KernelSchedule, EpilogueSchedule>;
|
||||||
|
};
|
||||||
|
|
||||||
template <typename InType, typename OutType,
|
template <typename InType, typename OutType,
|
||||||
template <typename, typename, typename> typename Epilogue,
|
template <typename, typename, typename> typename Epilogue,
|
||||||
typename... EpilogueArgs>
|
typename... EpilogueArgs>
|
||||||
@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
|
|||||||
using Cutlass3xGemmDefault =
|
using Cutlass3xGemmDefault =
|
||||||
typename sm100_fp8_config_default<InType, OutType,
|
typename sm100_fp8_config_default<InType, OutType,
|
||||||
Epilogue>::Cutlass3xGemm;
|
Epilogue>::Cutlass3xGemm;
|
||||||
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
|
using Cutlass3xGemmM64 =
|
||||||
out, a, b, std::forward<EpilogueArgs>(args)...);
|
typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
using Cutlass3xGemmM128 =
|
||||||
|
typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
|
||||||
|
|
||||||
|
uint32_t const m = a.size(0);
|
||||||
|
uint32_t const mp2 =
|
||||||
|
std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2
|
||||||
|
|
||||||
|
if (mp2 <= 64) {
|
||||||
|
// m in [1, 64]
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM64>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else if (mp2 <= 128) {
|
||||||
|
// m in (64, 128]
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmM128>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
} else {
|
||||||
|
// m in (128, inf)
|
||||||
|
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
|
||||||
|
out, a, b, std::forward<EpilogueArgs>(args)...);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <template <typename, typename, typename> typename Epilogue,
|
template <template <typename, typename, typename> typename Epilogue,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user