mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 23:02:15 +08:00
[CPU] Upgrade CPU backend to torch-2.6 (#13381)
Signed-off-by: jiang1.li <jiang1.li@intel.com> Co-authored-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
parent
debd6bbf09
commit
ff47aab056
@ -19,13 +19,14 @@ remove_docker_container
|
|||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
set -e
|
set -e
|
||||||
export NUMA_NODE=$2
|
export NUMA_NODE=$2
|
||||||
|
export BUILDKITE_BUILD_NUMBER=$3
|
||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
||||||
@ -36,6 +37,7 @@ function cpu_tests() {
|
|||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pip install -r vllm/requirements/test.txt
|
pip install -r vllm/requirements/test.txt
|
||||||
|
pip install -r vllm/requirements/cpu.txt
|
||||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||||
@ -85,4 +87,4 @@ function cpu_tests() {
|
|||||||
|
|
||||||
# All CPU tests are expected to finish in less than 40 minutes.
|
# All CPU tests are expected to finish in less than 40 minutes.
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
|
||||||
|
|||||||
@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
|
|||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||||
|
|
||||||
RUN pip install intel_extension_for_pytorch==2.5.0
|
RUN pip install intel_extension_for_pytorch==2.6.0
|
||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
|
|||||||
@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
|
|||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
oneDNN
|
oneDNN
|
||||||
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
|
||||||
GIT_TAG v3.6
|
GIT_TAG v3.7.1
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
GIT_SHALLOW TRUE
|
GIT_SHALLOW TRUE
|
||||||
)
|
)
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
-r common.txt
|
-r common.txt
|
||||||
|
|
||||||
# Dependencies for CPUs
|
# Dependencies for CPUs
|
||||||
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x"
|
torch==2.6.0+cpu; platform_machine == "x86_64"
|
||||||
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
|
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
|
||||||
torch==2.7.0.dev20250304; platform_machine == "s390x"
|
torch==2.7.0.dev20250304; platform_machine == "s390x"
|
||||||
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest
|
|||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=not current_platform.is_cpu())
|
||||||
def v1(run_with_both_engines_lora):
|
def v1(run_with_both_engines_lora):
|
||||||
# Simple autouse wrapper to run both engines for each test
|
# Simple autouse wrapper to run both engines for each test
|
||||||
# This can be promoted up to conftest.py to run for every
|
# This can be promoted up to conftest.py to run for every
|
||||||
|
|||||||
@ -17,7 +17,7 @@ class _PagedAttention:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_supported_head_sizes() -> List[int]:
|
def get_supported_head_sizes() -> List[int]:
|
||||||
return [32, 64, 80, 96, 112, 128, 256]
|
return [32, 64, 80, 96, 112, 128, 192, 256]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_kv_cache_shape(
|
def get_kv_cache_shape(
|
||||||
|
|||||||
@ -254,10 +254,11 @@ def _run_worker_process(
|
|||||||
# online (in situ) tuning is enabled.
|
# online (in situ) tuning is enabled.
|
||||||
# Offline tuning API (record_untuned_is_enabled()) only
|
# Offline tuning API (record_untuned_is_enabled()) only
|
||||||
# available in PyTorch 2.6 or later.
|
# available in PyTorch 2.6 or later.
|
||||||
import torch.cuda.tunable as tunable
|
if torch.cuda.is_available():
|
||||||
if (tunable.is_enabled() and tunable.tuning_is_enabled()
|
import torch.cuda.tunable as tunable
|
||||||
and not tunable.record_untuned_is_enabled()):
|
if (tunable.is_enabled() and tunable.tuning_is_enabled()
|
||||||
tunable.write_file()
|
and not tunable.record_untuned_is_enabled()):
|
||||||
|
tunable.write_file()
|
||||||
|
|
||||||
logger.info("Worker exiting")
|
logger.info("Worker exiting")
|
||||||
|
|
||||||
|
|||||||
@ -193,10 +193,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
|||||||
global_num_experts: int = -1,
|
global_num_experts: int = -1,
|
||||||
expert_map: Optional[torch.Tensor] = None,
|
expert_map: Optional[torch.Tensor] = None,
|
||||||
custom_routing_function: Optional[Callable] = None,
|
custom_routing_function: Optional[Callable] = None,
|
||||||
|
scoring_func: str = "softmax",
|
||||||
|
e_score_correction_bias: Optional[torch.Tensor] = None,
|
||||||
activation: str = "silu",
|
activation: str = "silu",
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
assert custom_routing_function is None
|
|
||||||
assert activation == "silu", f"{activation} is not supported."
|
assert activation == "silu", f"{activation} is not supported."
|
||||||
return layer.ipex_fusion(
|
return layer.ipex_fusion(
|
||||||
x,
|
x,
|
||||||
@ -206,6 +207,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
|||||||
renormalize,
|
renormalize,
|
||||||
topk_group,
|
topk_group,
|
||||||
num_expert_group,
|
num_expert_group,
|
||||||
|
custom_routing_function,
|
||||||
|
scoring_func,
|
||||||
|
e_score_correction_bias,
|
||||||
)
|
)
|
||||||
|
|
||||||
def forward_tpu(
|
def forward_tpu(
|
||||||
|
|||||||
@ -121,6 +121,9 @@ class CpuPlatform(Platform):
|
|||||||
# Disable torch async compiling which won't work with daemonic processes
|
# Disable torch async compiling which won't work with daemonic processes
|
||||||
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
|
||||||
|
|
||||||
|
# MLA attention is not supported
|
||||||
|
os.environ["VLLM_MLA_DISABLE"] = "1"
|
||||||
|
|
||||||
# Intel OpenMP setting
|
# Intel OpenMP setting
|
||||||
ld_prealod_str = os.getenv("LD_PRELOAD", "")
|
ld_prealod_str = os.getenv("LD_PRELOAD", "")
|
||||||
if "libiomp5.so" in ld_prealod_str:
|
if "libiomp5.so" in ld_prealod_str:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user