From 1a504aff6c79878238e535d78f322e6cae40b71a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 18 Mar 2025 23:57:39 +0800 Subject: [PATCH] [Bugfix] Fix broken CPU quantization due to triton import (#15038) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/layers/quantization/gguf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index c92bcbea540a5..c8ab12d9a0aa2 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, FusedMoEMethodBase) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase @@ -140,6 +139,10 @@ def _fused_moe_gguf( qweight_type2: int, act, ) -> torch.Tensor: + # lazy import to avoid triggering triton import in CPU backend + from vllm.model_executor.layers.fused_moe.fused_moe import ( + moe_align_block_size) + out_hidden_states = torch.empty_like(x) if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES: num_tokens, _ = x.shape