[Hardware][Intel CPU] Adding intel openmp tunings in Docker file (#6008)

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>

parent 27902d42be
commit 81d7a50f24
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -12,8 +12,10 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
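The CI now passes --entrypoint /bin/bash because the image's default entrypoint changes below from a shell to the OpenAI API server (see the Dockerfile hunk that follows). For orientation, a minimal sketch of the kind of offline-inference call that the examples/offline_inference.py smoke test exercises; the model name and prompt here are illustrative assumptions, not taken from that script:

# Sketch of an offline-inference smoke test like the one the CI step runs.
# Assumes a working vLLM CPU build; model and prompt are illustrative.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # small model, chosen only for the sketch
sampling = SamplingParams(temperature=0.8, top_p=0.95)
for output in llm.generate(["Hello, my name is"], sampling):
    print(output.outputs[0].text)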
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -6,7 +6,13 @@ RUN apt-get update -y \
     && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
-RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
+RUN pip install intel-openmp
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
 
 RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
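Since the ENV line now bakes both tcmalloc and the Intel OpenMP runtime into every process started in the image (replacing the old .bashrc export, which only affected interactive shells), a quick way to confirm the preload actually took effect is to scan the process's own memory maps. A minimal sketch, not part of the commit and Linux-only; the library names follow the ENV line above:

# Sketch: check that the shared objects named in LD_PRELOAD are mapped into
# the current process (Linux-only).
import os

def preloaded(names=("libtcmalloc_minimal", "libiomp5")):
    with open("/proc/self/maps") as f:
        maps = f.read()
    return {name: name in maps for name in names}

print("LD_PRELOAD =", os.getenv("LD_PRELOAD", "<unset>"))
print(preloaded())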
@@ -31,4 +37,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-CMD ["/bin/bash"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
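With the new ENTRYPOINT, the container starts the OpenAI-compatible API server by default instead of a shell, which is why the CI script above overrides it with --entrypoint /bin/bash. A hedged sketch of probing the default server, assuming the container is running and that port 8000 (the api_server default) is reachable from this process:

# Sketch: probe the OpenAI-compatible server the image now starts by default.
# Assumes port 8000 is published and the server has finished loading a model.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8000/v1/models") as resp:
    print(json.dumps(json.load(resp), indent=2))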
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -398,6 +398,27 @@ def update_environment_variables(envs: Dict[str, str]):
         os.environ[k] = v
 
 
+def init_kmp_env():
+    if not is_cpu():
+        return
+
+    ld_preload_str = os.getenv("LD_PRELOAD", "")
+    if "libiomp5.so" not in ld_preload_str:
+        return
+
+    # The time (in milliseconds) that a thread should wait after completing
+    # the execution of a parallel region, before sleeping.
+    os.environ['KMP_BLOCKTIME'] = "1"
+    # Dump the effective settings on startup.
+    os.environ['KMP_SETTINGS'] = "1"
+    # Prevents the CPU from dropping into a low-performance state.
+    os.environ['KMP_TPAUSE'] = "0"
+    # Provides fine-granularity parallelism.
+    os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
+    os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
+    os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
+
+
 def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
     """Yield successive chunk_size chunks from lst."""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
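The helper only takes effect on a CPU build with libiomp5.so actually preloaded, and the KMP_* variables must be exported before the OpenMP runtime initializes its first parallel region, which is why the worker calls it during construction (next hunk). A standalone sketch mirroring the helper's effect without importing vllm; the LD_PRELOAD value is an assumption for illustration:

# Standalone sketch mirroring init_kmp_env (does not import vllm).
# The LD_PRELOAD value is illustrative; in the image it is set by the ENV line.
import os

os.environ.setdefault("LD_PRELOAD", "/usr/local/lib/libiomp5.so")

KMP_TUNINGS = {
    "KMP_BLOCKTIME": "1",             # spin ~1 ms after a parallel region
    "KMP_SETTINGS": "1",              # print effective settings at startup
    "KMP_TPAUSE": "0",                # avoid low-power pause states
    "KMP_FORKJOIN_BARRIER_PATTERN": "dist,dist",
    "KMP_PLAIN_BARRIER_PATTERN": "dist,dist",
    "KMP_REDUCTION_BARRIER_PATTERN": "dist,dist",
}

if "libiomp5.so" in os.getenv("LD_PRELOAD", ""):
    os.environ.update(KMP_TUNINGS)
    print("Applied Intel OpenMP tunings:", sorted(KMP_TUNINGS))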
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -13,7 +13,7 @@ from vllm.distributed import (ensure_model_parallel_initialized,
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, init_kmp_env
 from vllm.worker.cpu_model_runner import CPUModelRunner
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerInput)
@@ -150,6 +150,9 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         if self.is_driver_worker:
             assert self.rank == 0, "The driver worker must have rank 0."
 
+        # Try to initialize the Intel OpenMP optimized tunings.
+        init_kmp_env()
+
         if self.model_config.trust_remote_code:
             # note: lazy import to avoid importing torch before initializing
             from vllm.utils import init_cached_hf_modules
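The call sits near the top of the worker's constructor so the tunings are exported before any OpenMP parallel region runs; init_kmp_env itself is a no-op on non-CPU builds or when libiomp5.so is not preloaded. A minimal sketch of that ordering, with the surrounding worker setup elided:

# Sketch: the ordering the commit establishes. init_kmp_env must run before
# the first OpenMP parallel region (e.g., CPU torch ops during model loading).
from vllm.utils import init_kmp_env

init_kmp_env()  # no-op unless this is a CPU build with libiomp5.so preloaded
# ... then construct the model runner and load the model ...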