update modelopt path

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2026-05-21 22:44:31 +08:00 · 2025-11-01 16:28:18 +00:00 · 2025-11-01 16:28:18 +00:00 · 52ff537459
commit 52ff537459
parent dd5a70ec71
1 changed files with 13 additions and 10 deletions
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -37,6 +37,12 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
 )
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+    init_fp8_linear_kernel,
+)
+from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
+    ScaledMMLinearQuantStrategy,
+)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
    build_flashinfer_fp4_cutlass_moe_prepare_finalize,
@@ -68,7 +74,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
    swizzle_blockscale,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp,
    requantize_with_max_scale,
 )
 from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter
@@ -254,8 +259,12 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
    def __init__(self, quant_config: ModelOptFp8Config) -> None:
        self.quant_config = quant_config
-        self.fp8_linear = Fp8LinearOp(
+        self.fp8_linear = init_fp8_linear_kernel(
-            act_quant_static=True, act_quant_group_shape=GroupShape.PER_TENSOR
+            act_q_static=True,
+            act_q_group_shape=GroupShape.PER_TENSOR,
+            weight_quant_strategy=ScaledMMLinearQuantStrategy.TENSOR,
+            out_dtype=None,
+            module_name=self.__class__.__name__,
        )
    def create_weights(
@@ -323,13 +332,7 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
-        return self.fp8_linear.apply(
+        return self.fp8_linear.apply_weights(layer, x, bias)
-            input=x,
-            weight=layer.weight,
-            weight_scale=layer.weight_scale,
-            input_scale=layer.input_scale,
-            bias=bias,
-        )
 class ModelOptFp8MoEMethod(FusedMoEMethodBase):