diff --git a/docs/source/getting_started/installation/cpu.md b/docs/source/getting_started/installation/cpu.md
index 9ca25e4709e86..43c9187f072e1 100644
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@@ -195,6 +195,7 @@ vLLM CPU backend supports the following vLLM features:
 - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
 - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores.
+- `VLLM_CPU_MOE_PREPACK`: whether to use prepack for the MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
 
 ## Performance tips
diff --git a/vllm/envs.py b/vllm/envs.py
index 24ee4583c75d8..259501056cc3b 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -40,6 +40,7 @@ if TYPE_CHECKING:
     VLLM_PP_LAYER_PARTITION: Optional[str] = None
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_CPU_OMP_THREADS_BIND: str = ""
+    VLLM_CPU_MOE_PREPACK: bool = True
     VLLM_OPENVINO_DEVICE: str = "CPU"
     VLLM_OPENVINO_KVCACHE_SPACE: int = 0
     VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
@@ -349,6 +350,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_CPU_OMP_THREADS_BIND":
     lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"),
 
+    # (CPU backend only) whether to use prepack for the MoE layer. This will
+    # be passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you
+    # might need to set this to "0" (False).
+    "VLLM_CPU_MOE_PREPACK":
+    lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
+
     # OpenVINO device selection
     # default is CPU
     "VLLM_OPENVINO_DEVICE":
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 2c5fa509c595d..917643134645f 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -7,6 +7,7 @@ from typing import Callable, List, Optional, Tuple
 import torch
 from torch.nn.parameter import UninitializedParameter
 
+from vllm import envs
 from vllm.config import get_current_vllm_config
 from vllm.distributed import (get_dp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
                 layer.w13_weight,
                 layer.w2_weight,
-                use_prepack=True,
+                use_prepack=envs.VLLM_CPU_MOE_PREPACK,
             )
         else:
             raise NotImplementedError("CPU MOE only supports x86 arch.")
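
The following is a minimal sketch of how the new flag behaves end to end. The `os.environ` assignment is purely illustrative usage; the parsing expression mirrors the lambda added to `vllm/envs.py` in the diff above, and the final comment points at where the value is consumed.

```python
import os

# Illustrative: disable MoE prepack before vLLM reads the environment,
# e.g. on CPUs where IPEX prepacking is unsupported.
os.environ["VLLM_CPU_MOE_PREPACK"] = "0"

# This mirrors the lambda registered in vllm/envs.py: the raw string is
# parsed with int(), then coerced to bool, defaulting to "1" (True).
use_prepack = bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1")))
assert use_prepack is False

# On the x86 CPU path, UnquantizedFusedMoEMethod forwards this value as
# ipex.llm.modules.GatedMLPMOE(..., use_prepack=envs.VLLM_CPU_MOE_PREPACK).
```

Because the value goes through `int()`, only integer strings such as `0` or `1` are valid; a setting like `VLLM_CPU_MOE_PREPACK=false` would raise a `ValueError` when the lambda runs.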