diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md
new file mode 100644
index 0000000000000..5c98e999da335
--- /dev/null
+++ b/tools/ep_kernels/README.md
@@ -0,0 +1,27 @@
+Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
+
+Here we break the requirements down into 3 steps:
+1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
+2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image.
+3. Build and install the system drivers (GDR Copy, and necessary modifications to the NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
+
+Steps 2 and 3 are only necessary for multi-node deployment.
+
+All scripts accept a positional argument as the workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. 
+
+# Usage
+
+## Single-node
+
+```bash
+bash install_python_libraries.sh
+```
+
+## Multi-node
+
+```bash
+bash install_python_libraries.sh
+sudo bash install_system_libraries.sh
+sudo bash install_system_drivers.sh
+sudo reboot # Reboot is required to load the new driver
+```
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
new file mode 100644
index 0000000000000..e5632f4b58758
--- /dev/null
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -0,0 +1,77 @@
+set -ex
+
+# prepare workspace directory (positional arg 1, defaults to ./ep_kernels_workspace)
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    export WORKSPACE="$(pwd)/ep_kernels_workspace"
+fi
+
+if [ ! -d "$WORKSPACE" ]; then
+    mkdir -p "$WORKSPACE"
+fi
+
+# install dependencies if not installed
+pip3 install cmake torch ninja
+
+# build gdrcopy, required by nvshmem
+pushd "$WORKSPACE"
+wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
+mkdir -p gdrcopy_src
+tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
+pushd gdrcopy_src
+make -j"$(nproc)"
+make prefix="$WORKSPACE/gdrcopy_install" install
+popd
+
+# build nvshmem
+pushd "$WORKSPACE"
+mkdir -p nvshmem_src
+wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
+tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
+pushd nvshmem_src
+wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
+git init
+git apply -vvv nvshmem.patch
+
+# assume CUDA_HOME is set correctly
+export GDRCOPY_HOME="$WORKSPACE/gdrcopy_install"
+export NVSHMEM_SHMEM_SUPPORT=0
+export NVSHMEM_UCX_SUPPORT=0
+export NVSHMEM_USE_NCCL=0
+export NVSHMEM_IBGDA_SUPPORT=1
+export NVSHMEM_PMIX_SUPPORT=0
+export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
+export NVSHMEM_USE_GDRCOPY=1
+export NVSHMEM_IBRC_SUPPORT=1
+
+# remove MPI dependency
+export NVSHMEM_BUILD_TESTS=0
+export NVSHMEM_BUILD_EXAMPLES=0
+export NVSHMEM_MPI_SUPPORT=0
+
+cmake -S . -B "$WORKSPACE/nvshmem_build/" -DCMAKE_INSTALL_PREFIX="$WORKSPACE/nvshmem_install"
+
+cd "$WORKSPACE/nvshmem_build/"
+make -j"$(nproc)"
+make install
+
+popd
+
+export CMAKE_PREFIX_PATH="$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH"
+
+# build and install pplx, requires pytorch installed
+pushd "$WORKSPACE"
+git clone https://github.com/ppl-ai/pplx-kernels
+cd pplx-kernels
+# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
+# PIP_NO_BUILD_ISOLATION=0 disables build isolation
+PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+popd
+
+# build and install deepep, requires pytorch installed
+pushd "$WORKSPACE"
+git clone https://github.com/deepseek-ai/DeepEP
+cd DeepEP
+export NVSHMEM_DIR="$WORKSPACE/nvshmem_install"
+PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
+popd
diff --git a/tools/ep_kernels/install_system_drivers.sh b/tools/ep_kernels/install_system_drivers.sh
new file mode 100644
index 0000000000000..8b0669ef404ff
--- /dev/null
+++ b/tools/ep_kernels/install_system_drivers.sh
@@ -0,0 +1,24 @@
+set -ex
+
+# prepare workspace directory (positional arg 1, defaults to ./ep_kernels_workspace)
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    export WORKSPACE="$(pwd)/ep_kernels_workspace"
+fi
+
+if [ ! -d "$WORKSPACE" ]; then
+    mkdir -p "$WORKSPACE"
+fi
+
+# build and install gdrcopy driver
+pushd "$WORKSPACE"
+cd gdrcopy_src
+./insmod.sh
+# run gdrcopy_copybw to test the installation
+"$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw"
+
+# turn on IBGDA
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
+update-initramfs -u
+
+echo "Please reboot the system to apply the changes"
diff --git a/tools/ep_kernels/install_system_libraries.sh b/tools/ep_kernels/install_system_libraries.sh
new file mode 100644
index 0000000000000..c148d5443900a
--- /dev/null
+++ b/tools/ep_kernels/install_system_libraries.sh
@@ -0,0 +1,18 @@
+set -ex
+
+# prepare workspace directory (positional arg 1, defaults to ./ep_kernels_workspace)
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    export WORKSPACE="$(pwd)/ep_kernels_workspace"
+fi
+
+if [ ! -d "$WORKSPACE" ]; then
+    mkdir -p "$WORKSPACE"
+fi
+
+# build and install gdrcopy system packages
+pushd "$WORKSPACE"
+cd gdrcopy_src/packages
+apt-get install -y devscripts
+CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
+dpkg -i *.deb