[Bug] Fix Layer weight_block_size Assertion Issue (#24674)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent 7a30fa8708
commit fcba05c435
@@ -450,10 +450,10 @@ class Fp8LinearMethod(LinearMethodBase):
             # Activations not quantized for marlin.
             del layer.input_scale
 
-        # On B200, if E8M0 for DeepGemm is used, we need to
+        # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
         # requantize the weight and input to the specific scale
         # at the same time.
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             block_sz = tuple(layer.weight_block_size)
             requant_weight_ue8m0_inplace(
@@ -905,7 +905,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         del layer.w13_input_scale
         del layer.w2_input_scale
 
-        if is_deep_gemm_e8m0_used():
+        if is_deep_gemm_e8m0_used() and self.block_quant:
             assert layer.weight_block_size is not None
             # Re-quantise the expert weights so their scales are UE8M0.
             block_sz = tuple(layer.weight_block_size)
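Below is a minimal, self-contained sketch of the failure mode and the gating pattern this commit introduces. It is not vLLM's actual implementation: the Layer dataclass and process_weights helper are hypothetical stand-ins, and is_deep_gemm_e8m0_used is stubbed out; only the weight_block_size attribute and the block_quant gate mirror the diff above.

# Hypothetical sketch of the bug fixed by this commit: with a per-tensor FP8
# checkpoint, layer.weight_block_size is None, so an unconditional
# `assert layer.weight_block_size is not None` fires whenever DeepGEMM E8M0
# is in use. Gating on block_quant as well avoids that.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Layer:
    # None for per-tensor FP8; a (block_n, block_k) pair for block quantization.
    weight_block_size: Optional[tuple]

def is_deep_gemm_e8m0_used() -> bool:
    # Stub for vLLM's runtime check (assume E8M0 DeepGEMM is enabled).
    return True

def process_weights(layer: Layer, block_quant: bool) -> None:
    # Old behavior: `if is_deep_gemm_e8m0_used():` alone, which raised
    # AssertionError for non-block-quantized layers. The fixed condition
    # only enters the requantization path when block quantization is active.
    if is_deep_gemm_e8m0_used() and block_quant:
        assert layer.weight_block_size is not None
        block_sz = tuple(layer.weight_block_size)
        # ... vLLM would requantize weight scales to UE8M0 here ...
        print(f"requantizing scales to UE8M0 with block size {block_sz}")
    else:
        print("skipping UE8M0 requantization (layer is not block-quantized)")

process_weights(Layer(weight_block_size=None), block_quant=False)      # previously crashed
process_weights(Layer(weight_block_size=(128, 128)), block_quant=True)

The same gate is applied in both hunks above: once in Fp8LinearMethod and once in Fp8MoEMethod.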