mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 20:05:01 +08:00
[Core] Default to using per_token quantization for fp8 when cutlass is supported. (#8651)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
parent
cd9d06fb8d
commit
fa0050db08
@@ -355,7 +355,8 @@ class Fp8LinearMethod(LinearMethodBase):
|
||||
input_scale=layer.input_scale,
|
||||
bias=bias,
|
||||
cutlass_fp8_supported=self.cutlass_fp8_supported,
|
||||
-use_per_token_if_dynamic=False)
|
||||
+# Default to using per_token quantization if cutlass is supported
|
||||
+use_per_token_if_dynamic=self.cutlass_fp8_supported)
|
||||
|
||||
|
||||
class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user