From 1a504aff6c79878238e535d78f322e6cae40b71a Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Tue, 18 Mar 2025 23:57:39 +0800
Subject: [PATCH] [Bugfix] Fix broken CPU quantization due to triton import
 (#15038)

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/layers/quantization/gguf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index c92bcbea540a5..c8ab12d9a0aa2 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                         FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -140,6 +139,10 @@ def _fused_moe_gguf(
     qweight_type2: int,
     act,
 ) -> torch.Tensor:
+    # lazy import to avoid triggering triton import in CPU backend
+    from vllm.model_executor.layers.fused_moe.fused_moe import (
+        moe_align_block_size)
+
     out_hidden_states = torch.empty_like(x)
     if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
         num_tokens, _ = x.shape