From cbfcff373b2d45b918f2d1e5eb721504a2cb7bcf Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Tue, 18 Nov 2025 07:22:05 +0000
Subject: [PATCH] add dynamic per tensor fallback

Signed-off-by: vllmellm
---
 vllm/model_executor/layers/quantization/fp8.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 67e5b65de6010..225ed9499fd4d 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -77,6 +77,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
     is_layer_skipped,
+    kFp8DynamicTensorSym,
     kFp8DynamicTokenSym,
     kFp8StaticTensorSym,
 )
@@ -381,9 +382,12 @@ class Fp8LinearMethod(LinearMethodBase):
         if not self.act_q_static and cutlass_fp8_supported():
             self.act_q_group_shape = GroupShape.PER_TOKEN
             self.activation_quant_key = kFp8DynamicTokenSym
-        else:
+        elif self.act_q_static:
             self.act_q_group_shape = GroupShape.PER_TENSOR
             self.activation_quant_key = kFp8StaticTensorSym
+        else:
+            self.act_q_group_shape = GroupShape.PER_TENSOR
+            self.activation_quant_key = kFp8DynamicTensorSym
 
         if self.block_quant:
             assert not self.act_q_static