mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 03:57:02 +08:00
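bug fix: register `input_scale_ub` (as None) on FP8 linear layers and include it in the `layer_param_names` passed to the FP8 linear kernel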
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
This commit is contained in:
parent
5fbe76bc0a
commit
e845035f4c
@ -146,6 +146,8 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
|
||||
input_scale = create_fp8_input_scale(output_partition_sizes, weight_loader)
|
||||
layer.register_parameter("input_scale", input_scale)
|
||||
|
||||
layer.register_parameter("input_scale_ub", None)
|
||||
|
||||
def process_weights_after_loading(self, layer) -> None:
|
||||
if self.strategy == QuantizationStrategy.TENSOR:
|
||||
weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(
|
||||
|
||||
@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import (
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
|
||||
init_fp8_linear_kernel,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
|
||||
ScaledMMLinearQuantStrategy,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
||||
|
||||
@ -451,6 +451,7 @@ class Fp8LinearMethod(LinearMethodBase):
|
||||
weight_loader=weight_loader,
|
||||
)
|
||||
layer.register_parameter("weight", weight)
|
||||
layer.register_parameter("input_scale_ub", None)
|
||||
|
||||
# If checkpoint is serialized fp8, load them.
|
||||
# Otherwise, wait until process_weights_after_loading.
|
||||
|
||||
@ -152,5 +152,5 @@ def init_fp8_linear_kernel(
|
||||
|
||||
return kernel_type(
|
||||
scaled_mm_linear_kernel_config,
|
||||
layer_param_names=["weight", "weight_scale", "input_scale"],
|
||||
layer_param_names=["weight", "weight_scale", "input_scale", "input_scale_ub"],
|
||||
)
|
||||
|
||||
@ -172,6 +172,8 @@ class QuarkW8A8Fp8(QuarkScheme):
|
||||
input_scale[:] = torch.finfo(torch.float32).min
|
||||
layer.register_parameter("input_scale", input_scale)
|
||||
|
||||
layer.register_parameter("input_scale_ub", None)
|
||||
|
||||
weight_quant_strategy = QUANT_STRATEGY_MAP[self.weight_qscheme]
|
||||
self.fp8_linear_kernel = init_fp8_linear_kernel(
|
||||
act_q_static=self.is_static_input_scheme,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user