Fix num_token_padding support for static per-tensor scaled_fp8_quant (#20188)

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-07-21 14:27:21 +08:00 · 2025-06-28 14:48:13 +09:00 · 2025-06-28 14:48:13 +09:00 · a29e62ea34
commit a29e62ea34
parent e53be6f00a
1 changed file with 1 addition and 2 deletions
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1274,8 +1274,7 @@ def scaled_fp8_quant(
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
    else:
-        # num_token_padding not implemented for this case
+        assert scale.numel() == 1
-        assert (scale.numel() == 1 and num_token_padding is None)
        torch.ops._C.static_scaled_fp8_quant(output, input, scale)
    return output, scale