Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
This commit is contained in:
vllmellm 2025-10-31 16:38:26 +00:00
parent 5fbe76bc0a
commit e845035f4c
5 changed files with 7 additions and 2 deletions

View File

@@ -146,6 +146,8 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
input_scale = create_fp8_input_scale(output_partition_sizes, weight_loader)
layer.register_parameter("input_scale", input_scale)
layer.register_parameter("input_scale_ub", None)
def process_weights_after_loading(self, layer) -> None:
if self.strategy == QuantizationStrategy.TENSOR:
weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy(

View File

@@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import (
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
init_fp8_linear_kernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
ScaledMMLinearQuantStrategy,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (

View File

@@ -451,6 +451,7 @@ class Fp8LinearMethod(LinearMethodBase):
weight_loader=weight_loader,
)
layer.register_parameter("weight", weight)
layer.register_parameter("input_scale_ub", None)
# If checkpoint is serialized fp8, load them.
# Otherwise, wait until process_weights_after_loading.

View File

@@ -152,5 +152,5 @@ def init_fp8_linear_kernel(
return kernel_type(
scaled_mm_linear_kernel_config,
layer_param_names=["weight", "weight_scale", "input_scale"],
layer_param_names=["weight", "weight_scale", "input_scale", "input_scale_ub"],
)

View File

@@ -172,6 +172,8 @@ class QuarkW8A8Fp8(QuarkScheme):
input_scale[:] = torch.finfo(torch.float32).min
layer.register_parameter("input_scale", input_scale)
layer.register_parameter("input_scale_ub", None)
weight_quant_strategy = QUANT_STRATEGY_MAP[self.weight_qscheme]
self.fp8_linear_kernel = init_fp8_linear_kernel(
act_q_static=self.is_static_input_scheme,