From a3319f4f04fbea7defe883e516df727711e516cd Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 12 Jun 2025 15:39:15 -0400
Subject: [PATCH] [Bugfix] Enforce contiguous input for dynamic_per_token
 FP8/INT8 quant (#19452)

Signed-off-by: mgoin
---
 vllm/_custom_ops.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index e26c90bf70cbe..9dbd0663eeff5 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1270,7 +1270,7 @@ def scaled_fp8_quant(
                                 device=input.device,
                                 dtype=torch.float32)
             torch.ops._C.dynamic_per_token_scaled_fp8_quant(
-                output, input, scale, scale_ub)
+                output, input.contiguous(), scale, scale_ub)
         else:
             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
@@ -1379,8 +1379,8 @@ def scaled_int8_quant(
                                dtype=torch.float32)
     input_azp = None if symmetric else torch.empty_like(input_scales,
                                                         dtype=torch.int32)
-    torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales,
-                                           input_azp)
+    torch.ops._C.dynamic_scaled_int8_quant(output, input.contiguous(),
+                                           input_scales, input_azp)
     return output, input_scales, input_azp
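
Note (not part of the patch): the dynamic per-token kernels read the input
as a dense row-major buffer, so a non-contiguous view (e.g. a transpose or
a slice) would be quantized from the wrong memory layout; for tensors that
are already contiguous, .contiguous() is a no-op and adds no copy. A minimal
sketch of how a caller could hit the fixed paths, assuming the public wrapper
signatures in vllm._custom_ops (the shapes below are illustrative):

    # Sketch only: exercises the patched paths with a non-contiguous input.
    import torch
    from vllm import _custom_ops as ops

    x = torch.randn(128, 64, dtype=torch.float16, device="cuda")
    xt = x.t()                      # transposed view of x
    assert not xt.is_contiguous()   # strided, not row-major

    # Dynamic per-token FP8 quant: the wrapper now calls xt.contiguous()
    # before invoking torch.ops._C.dynamic_per_token_scaled_fp8_quant.
    out_fp8, scales_fp8 = ops.scaled_fp8_quant(xt,
                                               use_per_token_if_dynamic=True)

    # Dynamic per-token INT8 quant: same fix inside scaled_int8_quant.
    out_i8, scales_i8, azp = ops.scaled_int8_quant(xt)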