[FIX] FP4 quantization kernel padding initialization bug (#31097)

Signed-off-by: <>
Co-authored-by: root <root@gpu-193.slurm-workers-slurm.slurm.svc.cluster.local>
Co-authored-by: root <root@gpu-951.slurm-workers-slurm.slurm.svc.cluster.local>
This commit is contained in:
danielafrimi 2025-12-23 18:45:18 +02:00 committed by GitHub
parent 38c361f99d
commit b94f80ffb8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -35,7 +35,7 @@ template <typename Int>
__host__ __device__ inline Int round_up(Int x, Int y) { __host__ __device__ inline Int round_up(Int x, Int y) {
static_assert(std::is_integral_v<Int>, static_assert(std::is_integral_v<Int>,
"round_up argument must be integral type"); "round_up argument must be integral type");
return (x + y - 1) / y * y; return ((x + y - 1) / y) * y;
} }
// Compute effective rows for grid configuration with swizzled SF layouts. // Compute effective rows for grid configuration with swizzled SF layouts.
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
int sf_m = round_up<int>(numRows, 128); int sf_m = round_up<int>(numRows, 128);
int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4; int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) { int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
// Each thread writes 4 uint32_t elements.
for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
col += blockDim.x * 4) {
SFout[row * sf_n_int + col] = 0x00;
}
}
// Get the global scaling factor, which will be applied to the SF. // Get the global scaling factor, which will be applied to the SF.
// Note SFScale is the same as next GEMM's alpha, which is // Note SFScale is the same as next GEMM's alpha, which is
// (448.f / (Alpha_A / 6.f)). // (448.f / (Alpha_A / 6.f)).
float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0]; float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
// Input tensor row/col loops. // Iterate over all rows and cols including padded ones -
for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) { // ensures we visit every single scale factor address to initialize it.
for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD; for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
for (int colIdx = threadIdx.x;
colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
colIdx += blockDim.x) { colIdx += blockDim.x) {
int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
PackedVec in_vec;
int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx; int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
// Get the output tensor offset. // If we are outside valid rows OR outside valid columns -> Use Zeros
// Same as inOffset because 8 elements are packed into one uint32_t. if (rowIdx >= numRows || elem_idx >= numCols) {
int64_t outOffset = inOffset; memset(&in_vec, 0, sizeof(PackedVec));
auto& out_pos = out[outOffset];
} else {
// Valid Region: Load actual data
in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
}
auto sf_out = auto sf_out =
cvt_quant_to_fp4_get_sf_out_offset<uint32_t, cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
CVT_FP4_NUM_THREADS_PER_SF>( CVT_FP4_NUM_THREADS_PER_SF>(
rowIdx, colIdx, numKTiles, SFout); rowIdx, colIdx, numKTiles, SFout);
out_pos = auto out_val =
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out); cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
// We do NOT write output for padding because the 'out' tensor is not
// padded.
if (rowIdx < numRows && elem_idx < numCols) {
// Same as inOffset because 8 elements are packed into one uint32_t.
out[inOffset] = out_val;
}
} }
} }
} }