[Bugfix] Fix broken CPU quantization due to triton import (#15038)

Signed-off-by: Isotr0py <2037008807@qq.com>
Isotr0py 2025-03-18 23:57:39 +08:00 committed by simon-mo
parent 01ca85bbd8
commit 1a504aff6c


@@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                         FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -140,6 +139,10 @@ def _fused_moe_gguf(
     qweight_type2: int,
     act,
 ) -> torch.Tensor:
+    # lazy import to avoid triggering triton import in CPU backend
+    from vllm.model_executor.layers.fused_moe.fused_moe import (
+        moe_align_block_size)
+
     out_hidden_states = torch.empty_like(x)
     if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
         num_tokens, _ = x.shape
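
For context, the fix relies on the standard deferred-import pattern: moving the triton-dependent import from module level into the function body means a CPU backend can import the quantization module without ever importing triton. A minimal, self-contained sketch of the pattern (the function names and dispatch logic below are illustrative assumptions, not vLLM's actual code):

import importlib.util

import torch


def _gpu_only_op(x: torch.Tensor) -> torch.Tensor:
    # Deferred import: triton is only imported when this branch actually
    # executes, so a CPU-only environment that never reaches it never pays
    # for (or fails on) the triton import.
    import triton  # noqa: F401
    return x  # stand-in for the real triton-backed kernel


def dispatch(x: torch.Tensor) -> torch.Tensor:
    # Cheap availability check that does not import triton itself.
    if torch.cuda.is_available() and importlib.util.find_spec("triton"):
        return _gpu_only_op(x)
    return x  # CPU fallback path

With the import at module level, merely importing the file on a CPU-only install would trigger the triton import at import time, which matches the breakage described in the commit title.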