From 136d750f5f421ca5be2e24b0a913e813d99bb831 Mon Sep 17 00:00:00 2001
From: czhu-cohere
Date: Fri, 25 Jul 2025 06:53:21 -0700
Subject: [PATCH] [Kernel] Improve machete memory bound perf (#21556)

Signed-off-by: czhu-cohere
---
 csrc/quantization/machete/machete_prepacked_layout.cuh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh
index 81aaa6c4f3a28..4a7d6341e6c00 100644
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
   CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
       Shape_NKL shape_mkl) {
     auto layout = TVbNbKL_to_offset(shape_mkl);
-    return make_layout(coalesce(get<0>(layout)), get<1>(layout),
-                       get<2>(layout));
+    // for 4-bit elements, having >= 64 values per column
+    // allows TMA to load full 32-byte sectors
+    auto inner_layout =
+        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
+
+    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
   }
 
   // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)