From 136d750f5f421ca5be2e24b0a913e813d99bb831 Mon Sep 17 00:00:00 2001
From: czhu-cohere <conway.zhu@cohere.com>
Date: Fri, 25 Jul 2025 06:53:21 -0700
Subject: [PATCH] [Kernel] Improve machete memory bound perf (#21556)

Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
---
 csrc/quantization/machete/machete_prepacked_layout.cuh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh
index 81aaa6c4f3a28..4a7d6341e6c00 100644
--- a/csrc/quantization/machete/machete_prepacked_layout.cuh
+++ b/csrc/quantization/machete/machete_prepacked_layout.cuh
@@ -187,8 +187,12 @@ struct PrepackedLayoutBTemplate {
   CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy(
       Shape_NKL shape_mkl) {
     auto layout = TVbNbKL_to_offset(shape_mkl);
-    return make_layout(coalesce(get<0>(layout)), get<1>(layout),
-                       get<2>(layout));
+    // For 4-bit elements, having >= 64 values per column (i.e. >= 32 bytes)
+    // allows TMA to load full 32-byte sectors.
+    auto inner_layout =
+        make_layout(make_shape(_256{}, size<0>(layout) / _256{}));
+
+    return make_layout(inner_layout, get<1>(layout), get<2>(layout));
   }
 
   // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx)