mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 00:15:51 +08:00
Simplify ep kernels installation (#19412)
Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
parent
e4248849ec
commit
64a9af5afa
@ -1,11 +1,10 @@
|
||||
Large-scale cluster-level expert parallelism, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such a deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
|
||||
|
||||
Here we break down the requirements in 3 steps:
|
||||
Here we break down the requirements in 2 steps:
|
||||
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
|
||||
2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that the libraries can be shipped as a single image.
|
||||
3. Build and install the system drivers (GDR Copy, and necessary modifications to NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
|
||||
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
|
||||
|
||||
Steps 2 and 3 are necessary for multi-node deployment.
|
||||
Step 2 is necessary for multi-node deployment.
|
||||
|
||||
All scripts accept an optional positional argument giving the workspace path used for staging the build; it defaults to `$(pwd)/ep_kernels_workspace`.
|
||||
|
||||
@ -21,7 +20,6 @@ bash install_python_libraries.sh
|
||||
|
||||
```bash
|
||||
bash install_python_libraries.sh
|
||||
sudo bash install_system_libraries.sh
|
||||
sudo bash install_system_drivers.sh
|
||||
sudo bash configure_system_drivers.sh
|
||||
sudo reboot # Reboot is required to load the new driver
|
||||
```
|
||||
|
||||
7
tools/ep_kernels/configure_system_drivers.sh
Normal file
7
tools/ep_kernels/configure_system_drivers.sh
Normal file
@ -0,0 +1,7 @@
|
||||
set -ex
|
||||
|
||||
# turn on IBGDA
|
||||
echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
|
||||
update-initramfs -u
|
||||
|
||||
echo "Please reboot the system to apply the changes"
|
||||
@ -13,16 +13,6 @@ fi
|
||||
# install dependencies if not installed
|
||||
pip3 install cmake torch ninja
|
||||
|
||||
# build gdrcopy, required by nvshmem
|
||||
pushd $WORKSPACE
|
||||
wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
|
||||
mkdir -p gdrcopy_src
|
||||
tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
|
||||
pushd gdrcopy_src
|
||||
make -j$(nproc)
|
||||
make prefix=$WORKSPACE/gdrcopy_install install
|
||||
popd
|
||||
|
||||
# build nvshmem
|
||||
pushd $WORKSPACE
|
||||
mkdir -p nvshmem_src
|
||||
@ -34,26 +24,30 @@ git init
|
||||
git apply -vvv nvshmem.patch
|
||||
|
||||
# assume CUDA_HOME is set correctly
|
||||
export GDRCOPY_HOME=$WORKSPACE/gdrcopy_install
|
||||
if [ -z "$CUDA_HOME" ]; then
|
||||
echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# disable all features except IBGDA
|
||||
export NVSHMEM_IBGDA_SUPPORT=1
|
||||
|
||||
export NVSHMEM_SHMEM_SUPPORT=0
|
||||
export NVSHMEM_UCX_SUPPORT=0
|
||||
export NVSHMEM_USE_NCCL=0
|
||||
export NVSHMEM_IBGDA_SUPPORT=1
|
||||
export NVSHMEM_PMIX_SUPPORT=0
|
||||
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
|
||||
export NVSHMEM_USE_GDRCOPY=1
|
||||
export NVSHMEM_IBRC_SUPPORT=1
|
||||
|
||||
# remove MPI dependency
|
||||
export NVSHMEM_USE_GDRCOPY=0
|
||||
export NVSHMEM_IBRC_SUPPORT=0
|
||||
export NVSHMEM_BUILD_TESTS=0
|
||||
export NVSHMEM_BUILD_EXAMPLES=0
|
||||
export NVSHMEM_MPI_SUPPORT=0
|
||||
export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
|
||||
export NVSHMEM_BUILD_TXZ_PACKAGE=0
|
||||
export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
|
||||
|
||||
cmake -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
|
||||
|
||||
cd $WORKSPACE/nvshmem_build/
|
||||
make -j$(nproc)
|
||||
make install
|
||||
cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
|
||||
cmake --build $WORKSPACE/nvshmem_build/ --target install
|
||||
|
||||
popd
|
||||
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
set -ex
|
||||
|
||||
# prepare workspace directory
|
||||
WORKSPACE=$1
|
||||
if [ -z "$WORKSPACE" ]; then
|
||||
export WORKSPACE=$(pwd)/ep_kernels_workspace
|
||||
fi
|
||||
|
||||
if [ ! -d "$WORKSPACE" ]; then
|
||||
mkdir -p $WORKSPACE
|
||||
fi
|
||||
|
||||
# build and install gdrcopy driver
|
||||
pushd $WORKSPACE
|
||||
cd gdrcopy_src
|
||||
./insmod.sh
|
||||
# run gdrcopy_copybw to test the installation
|
||||
$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw
|
||||
|
||||
# turn on IBGDA
|
||||
echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
|
||||
update-initramfs -u
|
||||
|
||||
echo "Please reboot the system to apply the changes"
|
||||
@ -1,18 +0,0 @@
|
||||
set -ex
|
||||
|
||||
# prepare workspace directory
|
||||
WORKSPACE=$1
|
||||
if [ -z "$WORKSPACE" ]; then
|
||||
export WORKSPACE=$(pwd)/ep_kernels_workspace
|
||||
fi
|
||||
|
||||
if [ ! -d "$WORKSPACE" ]; then
|
||||
mkdir -p $WORKSPACE
|
||||
fi
|
||||
|
||||
# build and install gdrcopy system packages
|
||||
pushd $WORKSPACE
|
||||
cd gdrcopy_src/packages
|
||||
apt install devscripts -y
|
||||
CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
|
||||
dpkg -i *.deb
|
||||
Loading…
x
Reference in New Issue
Block a user