From 4069db3f2e3fad65251ca5ed659c83ba48894ef2 Mon Sep 17 00:00:00 2001
From: roikoren755 <26850796+roikoren755@users.noreply.github.com>
Date: Thu, 9 Oct 2025 20:59:41 +0300
Subject: [PATCH] [Bugfix] Enable padded FP4 quantization (#25947)

Signed-off-by: Roi Koren
---
 vllm/_custom_ops.py      | 2 +-
 vllm/utils/flashinfer.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 68ef9b96bbceb..b2a7f8e808a4f 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1384,7 +1384,7 @@ def scaled_fp4_quant(
     rounded_m = round_up(m, 128)
     scale_n = n // block_size
     rounded_n = round_up(scale_n, 4)
-    output_scale = torch.empty(
+    output_scale = torch.zeros(
         (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
     )
 
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 159d19bfad31b..78d4a92dc1af1 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -386,8 +386,6 @@ def flashinfer_scaled_fp4_mm(
     assert block_scale_a.ndim == 2 and block_scale_b.ndim == 2
     assert a.stride(-1) == 1 and b.stride(-1) == 1
     assert a.shape[1] == b.shape[1]
-    assert block_scale_a.shape[1] == a.shape[1] // 8
-    assert block_scale_b.shape[1] == b.shape[1] // 8
 
     if backend == "cutlass":
         block_scale_a = block_scale_a.view(torch.uint8)
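
Note (not part of the patch): the sketch below is a minimal, self-contained illustration of why the switch from torch.empty to torch.zeros matters. scaled_fp4_quant pads the block-scale buffer up to a 128-row / 4-column swizzle tile, and torch.empty leaves the padding entries holding arbitrary bytes that a downstream kernel could read as nonzero scales. The round_up helper and the concrete shapes are illustrative assumptions mirroring the hunk above, not vLLM's actual code path.

    import torch

    def round_up(x: int, multiple: int) -> int:
        # Round x up to the nearest multiple of `multiple`,
        # mirroring the round_up helper referenced in the hunk above.
        return (x + multiple - 1) // multiple * multiple

    # Illustrative shapes: m is not a multiple of 128, so the scale
    # buffer acquires padding rows that the quantization kernel never writes.
    m, n, block_size = 200, 1024, 16
    scale_n = n // block_size          # one scale per 16-element block -> 64
    rounded_m = round_up(m, 128)       # 256: rows padded to the swizzle tile
    rounded_n = round_up(scale_n, 4)   # 64: scale columns padded to 4

    # With torch.empty, rows [m:rounded_m] would contain whatever bytes
    # happened to be in the allocation; torch.zeros makes the padding
    # decode to zero scales instead of garbage.
    output_scale = torch.zeros((rounded_m, rounded_n // 4), dtype=torch.int32)
    assert output_scale.shape == (256, 16)
    assert output_scale[m:].abs().sum() == 0  # padding region is deterministic

The same padding also explains the second hunk: the two removed asserts required the block-scale width to equal the unpadded a.shape[1] // 8 exactly, a condition a padded scale tensor no longer satisfies.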