diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md
new file mode 100644
index 0000000000000..5c98e999da335
--- /dev/null
+++ b/tools/ep_kernels/README.md
@@ -0,0 +1,27 @@
+Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Technical Report](http://arxiv.org/abs/2412.19437), is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package.
+
+Here we break the requirements down into 3 steps:
+1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
+2. Build and install the system libraries (GDR Copy). This step requires root access. You can do it inside a Docker container so that they can be shipped as a single image.
+3. Build and install the system drivers (GDR Copy, and necessary modifications to the NVIDIA driver to enable IBGDA). This step requires root access, and must be done on the host machine.
+
+Steps 2 and 3 are only necessary for multi-node deployment.
+
+All scripts accept a positional argument as the workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. 
+
+# Usage
+
+## Single-node
+
+```bash
+bash install_python_libraries.sh
+```
+
+## Multi-node
+
+```bash
+bash install_python_libraries.sh
+sudo bash install_system_libraries.sh
+sudo bash install_system_drivers.sh
+sudo reboot # Reboot is required to load the new driver
+```
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
new file mode 100644
index 0000000000000..e5632f4b58758
--- /dev/null
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -0,0 +1,77 @@
+set -ex
+
+# prepare workspace directory (positional arg 1, defaults to ./ep_kernels_workspace)
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    export WORKSPACE="$(pwd)/ep_kernels_workspace"
+fi
+
+if [ ! -d "$WORKSPACE" ]; then
+    mkdir -p "$WORKSPACE"
+fi
+
+# install dependencies if not installed
+pip3 install cmake torch ninja
+
+# build gdrcopy, required by nvshmem
+pushd "$WORKSPACE"
+wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.4.4.tar.gz
+mkdir -p gdrcopy_src
+tar -xvf v2.4.4.tar.gz -C gdrcopy_src --strip-components=1
+pushd gdrcopy_src
+make -j"$(nproc)"
+make prefix="$WORKSPACE/gdrcopy_install" install
+popd
+
+# build nvshmem
+pushd "$WORKSPACE"
+mkdir -p nvshmem_src
+wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
+tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
+pushd nvshmem_src
+wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
+git init
+git apply -vvv nvshmem.patch
+
+# assume CUDA_HOME is set correctly
+export GDRCOPY_HOME="$WORKSPACE/gdrcopy_install"
+export NVSHMEM_SHMEM_SUPPORT=0
+export NVSHMEM_UCX_SUPPORT=0
+export NVSHMEM_USE_NCCL=0
+export NVSHMEM_IBGDA_SUPPORT=1
+export NVSHMEM_PMIX_SUPPORT=0
+export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
+export NVSHMEM_USE_GDRCOPY=1
+export NVSHMEM_IBRC_SUPPORT=1
+
+# remove MPI dependency
+export NVSHMEM_BUILD_TESTS=0
+export NVSHMEM_BUILD_EXAMPLES=0
+export NVSHMEM_MPI_SUPPORT=0
+
+cmake -S . -B "$WORKSPACE/nvshmem_build/" -DCMAKE_INSTALL_PREFIX="$WORKSPACE/nvshmem_install"
+
+cd "$WORKSPACE/nvshmem_build/"
+make -j"$(nproc)"
+make install
+
+popd
+
+export CMAKE_PREFIX_PATH="$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH"
+
+# build and install pplx, requires pytorch installed
+pushd "$WORKSPACE"
+git clone https://github.com/ppl-ai/pplx-kernels
+cd pplx-kernels
+# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
+# PIP_NO_BUILD_ISOLATION=0 disables build isolation
+PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+popd
+
+# build and install deepep, requires pytorch installed
+pushd "$WORKSPACE"
+git clone https://github.com/deepseek-ai/DeepEP
+cd DeepEP
+export NVSHMEM_DIR="$WORKSPACE/nvshmem_install"
+PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
+popd
diff --git a/tools/ep_kernels/install_system_drivers.sh b/tools/ep_kernels/install_system_drivers.sh
new file mode 100644
index 0000000000000..8b0669ef404ff
--- /dev/null
+++ b/tools/ep_kernels/install_system_drivers.sh
@@ -0,0 +1,24 @@
+set -ex
+
+# prepare workspace directory (positional arg 1, defaults to ./ep_kernels_workspace)
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    export WORKSPACE="$(pwd)/ep_kernels_workspace"
+fi
+
+if [ ! -d "$WORKSPACE" ]; then
+    mkdir -p "$WORKSPACE"
+fi
+
+# build and install gdrcopy driver
+pushd "$WORKSPACE"
+cd gdrcopy_src
+./insmod.sh
+# run gdrcopy_copybw to test the installation
+"$WORKSPACE/gdrcopy_install/bin/gdrcopy_copybw"
+
+# turn on IBGDA
+echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1;"' | tee -a /etc/modprobe.d/nvidia.conf
+update-initramfs -u
+
+echo "Please reboot the system to apply the changes"
diff --git a/tools/ep_kernels/install_system_libraries.sh b/tools/ep_kernels/install_system_libraries.sh
new file mode 100644
index 0000000000000..c148d5443900a
--- /dev/null
+++ b/tools/ep_kernels/install_system_libraries.sh
@@ -0,0 +1,18 @@
+set -ex
+
+# prepare workspace directory (positional arg 1, defaults to ./ep_kernels_workspace)
+WORKSPACE=$1
+if [ -z "$WORKSPACE" ]; then
+    export WORKSPACE="$(pwd)/ep_kernels_workspace"
+fi
+
+if [ ! -d "$WORKSPACE" ]; then
+    mkdir -p "$WORKSPACE"
+fi
+
+# build and install gdrcopy system packages
+pushd "$WORKSPACE"
+cd gdrcopy_src/packages
+apt-get install -y devscripts
+CUDA=${CUDA_HOME:-/usr/local/cuda} ./build-deb-packages.sh
+dpkg -i *.deb