From abdfcd4f3dc21dc162baf6887f658fb0f2f3d783 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= <elvircrn@gmail.com>
Date: Thu, 18 Sep 2025 12:25:12 +0200
Subject: [PATCH] silu-v1: Fix EPS not being used during max-reduction (#25069)

Signed-off-by: elvircrn <elvircrn@gmail.com>
---
 csrc/quantization/activation_kernels.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu
index 9ddb5af3052f..9aa1411b4a25 100644
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
   int32_t compute_pipeline_offset_64 = 0;
 
   for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
-    __nv_bfloat16 y_max_bf16 = EPS;
     __nv_bfloat162 results_bf162[2];
 
     cp_async_wait<NUM_STAGES - 2>();
@@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
     auto _y_max2 =
         __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
 
-    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
+    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
 
     // An entire group is assigned to a single warp, so a simple warp reduce
     // is used.