Simplify ep kernels installation (#19412)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2026-03-18 01:07:18 +08:00 · 2025-06-10 20:06:08 +08:00 · 2025-06-10 20:06:08 +08:00 · 64a9af5afa
commit 64a9af5afa
parent e4248849ec
5 changed files with 26 additions and 69 deletions
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@ -1,11 +1,10 @@
 Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.

-Here we break down the requirements in 3 steps:
+Here we break down the requirements in 2 steps:
 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
-2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image.
-3. Build and install the system drivers (GDR Copy, and necessary modifications to NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
+2. Configure the NVIDIA driver to enable IBGDA. This step requires root access and must be done on the host machine.

-2 and 3 are necessary for multi-node deployment.
+Step 2 is necessary for multi-node deployment.

 All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.

@ -21,7 +20,6 @@ bash install_python_libraries.sh

 ```bash
 bash install_python_libraries.sh
-sudo bash install_system_libraries.sh
-sudo bash install_system_drivers.sh
+sudo bash configure_system_drivers.sh
 sudo reboot # Reboot is required to load the new driver
 ```
--- a/tools/ep_kernels/configure_system_drivers.sh
+++ b/tools/ep_kernels/configure_system_drivers.sh
@ -0,0 +1,7 @@
+set -ex
+
+# turn on IBGDA
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
+update-initramfs -u
+
+echo "Please reboot the system to apply the changes"
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@ -13,16 +13,6 @@ fi
 # install dependencies if not installed
 pip3 install cmake torch ninja

-# build gdrcopy, required by nvshmem
-pushd $WORKSPACE
-wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
-mkdir -p gdrcopy_src
-tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
-pushd gdrcopy_src
-make -j$(nproc)
-make prefix=$WORKSPACE/gdrcopy_install install
-popd
-
 # build nvshmem
 pushd $WORKSPACE
 mkdir -p nvshmem_src
@ -34,26 +24,30 @@ git init
 git apply -vvv nvshmem.patch

 # assume CUDA_HOME is set correctly
-export GDRCOPY_HOME=$WORKSPACE/gdrcopy_install
+if [ -z "$CUDA_HOME" ]; then
+    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
+    exit 1
+fi
+
+# disable all features except IBGDA
+export NVSHMEM_IBGDA_SUPPORT=1
+
 export NVSHMEM_SHMEM_SUPPORT=0
 export NVSHMEM_UCX_SUPPORT=0
 export NVSHMEM_USE_NCCL=0
-export NVSHMEM_IBGDA_SUPPORT=1
 export NVSHMEM_PMIX_SUPPORT=0
 export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-export NVSHMEM_USE_GDRCOPY=1
-export NVSHMEM_IBRC_SUPPORT=1
-
-# remove MPI dependency
+export NVSHMEM_USE_GDRCOPY=0
+export NVSHMEM_IBRC_SUPPORT=0
 export NVSHMEM_BUILD_TESTS=0
 export NVSHMEM_BUILD_EXAMPLES=0
 export NVSHMEM_MPI_SUPPORT=0
+export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
+export NVSHMEM_BUILD_TXZ_PACKAGE=0
+export NVSHMEM_TIMEOUT_DEVICE_POLLING=0

-cmake -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
-
-cd $WORKSPACE/nvshmem_build/
-make -j$(nproc)
-make install
+cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
+cmake --build $WORKSPACE/nvshmem_build/ --target install

 popd

--- a/tools/ep_kernels/install_system_drivers.sh
+++ b/tools/ep_kernels/install_system_drivers.sh
@ -1,24 +0,0 @@
-set -ex
-
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
-
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
-
-# build and install gdrcopy driver
-pushd $WORKSPACE
-cd gdrcopy_src
-./insmod.sh
-# run gdrcopy_copybw to test the installation
-$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw
-
-# turn on IBGDA
-echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
-update-initramfs -u
-
-echo "Please reboot the system to apply the changes"
--- a/tools/ep_kernels/install_system_libraries.sh
+++ b/tools/ep_kernels/install_system_libraries.sh
@ -1,18 +0,0 @@
-set -ex
-
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
-
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
-
-# build and install gdrcopy system packages
-pushd $WORKSPACE
-cd gdrcopy_src/packages
-apt install devscripts -y
-CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
-dpkg -i *.deb