mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 01:57:02 +08:00
[Bugfix] Fix broken CPU quantization due to triton import (#15038)
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
parent
01ca85bbd8
commit
1a504aff6c
@@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                         FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -140,6 +139,10 @@ def _fused_moe_gguf(
     qweight_type2: int,
     act,
 ) -> torch.Tensor:
+    # lazy import to avoid triggering triton import in CPU backend
+    from vllm.model_executor.layers.fused_moe.fused_moe import (
+        moe_align_block_size)
+
     out_hidden_states = torch.empty_like(x)
     if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
         num_tokens, _ = x.shape
Loading…
x
Reference in New Issue
Block a user