[Hybrid] [Kernel] Fix chunk scan kernel when BLOCK_SIZE_DSTATE > 128 (#28295)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
Thomas Parnell 2025-11-14 23:55:42 +01:00 committed by GitHub
parent bf3ffb61e6
commit e0c910bb89
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -245,7 +245,7 @@ def _chunk_scan_fwd_kernel(
)
if not HAS_INITSTATES and (seq_idx != seq_idx_prev):
prev_states = tl.zeros(
(BLOCK_SIZE_DSTATE, BLOCK_SIZE_K), dtype=C_ptr.dtype.element_ty
(BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=C_ptr.dtype.element_ty
)
else:
prev_states = tl.load(