From fa0050db08660535368ec5ea41d313bdeb69909d Mon Sep 17 00:00:00 2001 From: Elfie Guo <164945471+elfiegg@users.noreply.github.com> Date: Wed, 15 Jan 2025 20:31:27 -0800 Subject: [PATCH] [Core] Default to using per_token quantization for fp8 when cutlass is supported. (#8651) Signed-off-by: mgoin Co-authored-by: Michael Goin Co-authored-by: mgoin --- vllm/model_executor/layers/quantization/fp8.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a1be45a49e94a..4969ee559522e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -355,7 +355,8 @@ class Fp8LinearMethod(LinearMethodBase): input_scale=layer.input_scale, bias=bias, cutlass_fp8_supported=self.cutlass_fp8_supported, - use_per_token_if_dynamic=False) + # Default to using per_token quantization if cutlass is supported + use_per_token_if_dynamic=self.cutlass_fp8_supported) class Fp8MoEMethod(FusedMoEMethodBase):