diff --git a/docker/Dockerfile b/docker/Dockerfile
index 709b79e84fbb..1b937bbc1225 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -85,7 +85,7 @@ ARG GET_PIP_URL
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo python3-pip \
+    && apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \
     && curl -LsSf https://astral.sh/uv/install.sh | sh \
     && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
     && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
@@ -224,6 +224,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
+# Install DeepGEMM from source
+ARG DEEPGEMM_GIT_REF
+COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
+
+# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
+RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
+
+COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
+# Install EP kernels(pplx-kernels and DeepEP)
+RUN --mount=type=cache,target=/root/.cache/uv \
+    export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
+    /tmp/install_python_libraries.sh /tmp/ep_kernels_workspace wheel && \
+    find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
+
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
@@ -289,7 +305,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
+    && apt-get install -y software-properties-common curl sudo python3-pip \
     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
         if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
@@ -356,36 +372,32 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     . /etc/environment && \
     uv pip list
 
-# Even when we build Flashinfer with AOT mode, there's still
-# some issues w.r.t. JIT compilation. Therefore we need to
-# install build dependencies for JIT compilation.
-# TODO: Remove this once FlashInfer AOT wheel is fixed
-COPY requirements/build.txt requirements/build.txt
+# Install deepgemm wheel that has been built in the `build` stage
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt \
+    --mount=type=bind,from=build,source=/tmp/deepgemm/dist,target=/tmp/deepgemm/dist,ro \
+    sh -c 'if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
+        uv pip install --system /tmp/deepgemm/dist/*.whl; \
+    else \
+        echo "No DeepGEMM wheels to install; skipping."; \
+    fi'
+
+# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
+RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system ep_kernels/dist/*.whl --verbose \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# Install DeepGEMM from source
-ARG DEEPGEMM_GIT_REF
-COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
-RUN --mount=type=cache,target=/root/.cache/uv \
-    VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
-
-COPY tools/install_gdrcopy.sh install_gdrcopy.sh
-RUN set -eux; \
+RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
+    set -eux; \
     case "${TARGETPLATFORM}" in \
         linux/arm64) UUARCH="aarch64" ;; \
         linux/amd64) UUARCH="x64" ;; \
         *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
     esac; \
-    ./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \
-    rm ./install_gdrcopy.sh
-
-# Install EP kernels(pplx-kernels and DeepEP)
-COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
-ENV CUDA_HOME=/usr/local/cuda
-RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
-    && bash install_python_libraries.sh
+    /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
 
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
 # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
@@ -415,6 +427,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
 
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y git
+
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
@@ -455,12 +472,11 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
 
-COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
-
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
     if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        uv pip install --system -r requirements/kv_connectors.txt; \
+        uv pip install --system -r /tmp/kv_connectors.txt; \
     fi; \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         BITSANDBYTES_VERSION="0.42.0"; \
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index f8c104ba1425..57a33524a516 100644
Binary files a/docs/assets/contributing/dockerfile-stages-dependency.png and b/docs/assets/contributing/dockerfile-stages-dependency.png differ
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 5ea543f4cb1e..77af3f68a050 100755
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -1,94 +1,79 @@
 #!/usr/bin/env bash
 set -ex
 
-# prepare workspace directory
-WORKSPACE=$1
-if [ -z "$WORKSPACE" ]; then
-    export WORKSPACE=$(pwd)/ep_kernels_workspace
-fi
+# usage: ./install_python_libraries.sh [workspace_dir] [mode]
+# mode: "install" (default) → install directly into current Python env
+#       "wheel"             → build wheels into WORKSPACE/dist
 
-if [ ! -d "$WORKSPACE" ]; then
-    mkdir -p $WORKSPACE
-fi
+WORKSPACE=${1:-$(pwd)/ep_kernels_workspace}
+MODE=${2:-install}
+mkdir -p "$WORKSPACE"
+
+WHEEL_DIR="$WORKSPACE/dist"
+mkdir -p "$WHEEL_DIR"
+NVSHMEM_VER=3.3.9
+
+pushd "$WORKSPACE"
 
-# configurable pip command (default: pip3)
-PIP_CMD=${PIP_CMD:-pip3}
 CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
 
 # install dependencies if not installed
-$PIP_CMD install cmake torch ninja
-
-# build nvshmem
-pushd $WORKSPACE
-mkdir -p nvshmem_src
-wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
-tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
-pushd nvshmem_src
-wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
-git init
-git apply -vvv nvshmem.patch
-
-# assume CUDA_HOME is set correctly
-if [ -z "$CUDA_HOME" ]; then
-    echo "CUDA_HOME is not set, please set it to your CUDA installation directory."
-    exit 1
+if [ -z "$VIRTUAL_ENV" ]; then
+    uv pip install --system cmake torch ninja
+else
+    uv pip install cmake torch ninja
 fi
 
-# assume TORCH_CUDA_ARCH_LIST is set correctly
-if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
-    echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
+# fetch nvshmem
+ARCH=$(uname -m)
+case "${ARCH,,}" in
+  x86_64|amd64)
+    NVSHMEM_SUBDIR="linux-x86_64"
+    NVSHMEM_FILE="libnvshmem-linux-x86_64-${NVSHMEM_VER}_cuda12-archive.tar.xz"
+    ;;
+  aarch64|arm64)
+    NVSHMEM_SUBDIR="linux-sbsa"
+    NVSHMEM_FILE="libnvshmem-linux-sbsa-${NVSHMEM_VER}_cuda12-archive.tar.xz"
+    ;;
+  *)
+    echo "Unsupported architecture: ${ARCH}" >&2
     exit 1
-fi
+    ;;
+esac
 
-# disable all features except IBGDA
-export NVSHMEM_IBGDA_SUPPORT=1
-
-export NVSHMEM_SHMEM_SUPPORT=0
-export NVSHMEM_UCX_SUPPORT=0
-export NVSHMEM_USE_NCCL=0
-export NVSHMEM_PMIX_SUPPORT=0
-export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-export NVSHMEM_USE_GDRCOPY=0
-export NVSHMEM_IBRC_SUPPORT=0
-export NVSHMEM_BUILD_TESTS=0
-export NVSHMEM_BUILD_EXAMPLES=0
-export NVSHMEM_MPI_SUPPORT=0
-export NVSHMEM_BUILD_HYDRA_LAUNCHER=0
-export NVSHMEM_BUILD_TXZ_PACKAGE=0
-export NVSHMEM_TIMEOUT_DEVICE_POLLING=0
-
-cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install
-cmake --build $WORKSPACE/nvshmem_build/ --target install
+NVSHMEM_URL="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/${NVSHMEM_SUBDIR}/${NVSHMEM_FILE}"
+pushd "$WORKSPACE"
+echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
+curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
+tar -xf "${NVSHMEM_FILE}"
+mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
+rm -f "${NVSHMEM_FILE}"
+rm -rf nvshmem/lib/bin nvshmem/lib/share
 popd
 
-export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
+export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem/lib/cmake:$CMAKE_PREFIX_PATH
 
 is_git_dirty() {
     local dir=$1
     pushd "$dir" > /dev/null
-
-    if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
+    if [ -d ".git" ] && [ -n "$(git status --porcelain 2>/dev/null)" ]; then
         popd > /dev/null
-        return 0  # dirty (true)
+        return 0
     else
         popd > /dev/null
-        return 1  # clean (false)
+        return 1
     fi
 }
 
-# Function to handle git repository cloning with dirty/incomplete checks
 clone_repo() {
     local repo_url=$1
     local dir_name=$2
     local key_file=$3
     local commit_hash=$4
-
     if [ -d "$dir_name" ]; then
-        # Check if directory has uncommitted changes (dirty)
         if is_git_dirty "$dir_name"; then
             echo "$dir_name directory is dirty, skipping clone"
-        # Check if clone failed (directory exists but not a valid git repo or missing key files)
         elif [ ! -d "$dir_name/.git" ] || [ ! -f "$dir_name/$key_file" ]; then
             echo "$dir_name directory exists but clone appears incomplete, cleaning up and re-cloning"
             rm -rf "$dir_name"
@@ -99,7 +84,7 @@ clone_repo() {
                 cd ..
             fi
         else
-            echo "$dir_name directory exists and appears complete; manually update if needed"
+            echo "$dir_name directory exists and appears complete"
         fi
     else
         git clone "$repo_url"
@@ -111,17 +96,44 @@ clone_repo() {
     fi
 }
 
-# build and install pplx, require pytorch installed
-pushd $WORKSPACE
-clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
-cd pplx-kernels
-$PIP_CMD install --no-build-isolation -vvv -e .
-popd
+do_build() {
+    local repo=$1
+    local name=$2
+    local key=$3
+    local commit=$4
+    local extra_env=$5
 
-# build and install deepep, require pytorch installed
-pushd $WORKSPACE
-clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "73b6ea4"
-cd DeepEP
-export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-$PIP_CMD install --no-build-isolation -vvv -e .
-popd
+    pushd "$WORKSPACE"
+    clone_repo "$repo" "$name" "$key" "$commit"
+    cd "$name"
+
+    if [ "$MODE" = "install" ]; then
+        echo "Installing $name into environment"
+        eval "$extra_env" uv pip install --no-build-isolation -vvv .
+    else
+        echo "Building $name wheel into $WHEEL_DIR"
+        eval "$extra_env" uv build --wheel --no-build-isolation -vvv --out-dir "$WHEEL_DIR" .
+    fi
+    popd
+}
+
+# build pplx-kernels
+do_build \
+    "https://github.com/ppl-ai/pplx-kernels" \
+    "pplx-kernels" \
+    "setup.py" \
+    "12cecfd" \
+    ""
+
+# build DeepEP
+do_build \
+    "https://github.com/deepseek-ai/DeepEP" \
+    "DeepEP" \
+    "setup.py" \
+    "73b6ea4" \
+    "export NVSHMEM_DIR=$WORKSPACE/nvshmem; "
+
+if [ "$MODE" = "wheel" ]; then
+    echo "All wheels written to $WHEEL_DIR"
+    ls -l "$WHEEL_DIR"
+fi
diff --git a/tools/install_deepgemm.sh b/tools/install_deepgemm.sh
index 4f2cd302c3ef..ee9a5dd4aa64 100755
--- a/tools/install_deepgemm.sh
+++ b/tools/install_deepgemm.sh
@@ -1,12 +1,13 @@
 #!/bin/bash
-# Script to install DeepGEMM from source
-# This script can be used both in Docker builds and by users locally
-
+# Script to build and/or install DeepGEMM from source
+# Default: build and install immediately
+# Optional: build wheels to a directory for later installation (useful in multi-stage builds)
 set -e
 
 # Default values
 DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
 DEEPGEMM_GIT_REF="594953acce41793ae00a1233eb516044d604bcb6"
+WHEEL_DIR=""
 
 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
@@ -27,11 +28,20 @@ while [[ $# -gt 0 ]]; do
             CUDA_VERSION="$2"
             shift 2
             ;;
+        --wheel-dir)
+            if [[ -z "$2" || "$2" =~ ^- ]]; then
+                echo "Error: --wheel-dir requires a directory path." >&2
+                exit 1
+            fi
+            WHEEL_DIR="$2"
+            shift 2
+            ;;
         -h|--help)
            echo "Usage: $0 [OPTIONS]"
            echo "Options:"
            echo "  --ref REF            Git reference to checkout (default: $DEEPGEMM_GIT_REF)"
            echo "  --cuda-version VER   CUDA version (auto-detected if not provided)"
+           echo "  --wheel-dir PATH     If set, build wheel into PATH but do not install"
            echo "  -h, --help           Show this help message"
            exit 0
            ;;
@@ -57,16 +67,15 @@ fi
 CUDA_MAJOR="${CUDA_VERSION%%.*}"
 CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}"
 CUDA_MINOR="${CUDA_MINOR%%.*}"
-
 echo "CUDA version: $CUDA_VERSION (major: $CUDA_MAJOR, minor: $CUDA_MINOR)"
 
 # Check CUDA version requirement
 if [ "$CUDA_MAJOR" -lt 12 ] || { [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -lt 8 ]; }; then
-    echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
+    echo "Skipping DeepGEMM build/installation (requires CUDA 12.8+ but got ${CUDA_VERSION})"
     exit 0
 fi
 
-echo "Installing DeepGEMM from source..."
+echo "Preparing DeepGEMM build..."
 echo "Repository: $DEEPGEMM_GIT_REPO"
 echo "Reference: $DEEPGEMM_GIT_REF"
 
@@ -76,23 +85,31 @@ trap 'rm -rf "$INSTALL_DIR"' EXIT
 
 # Clone the repository
 git clone --recursive --shallow-submodules "$DEEPGEMM_GIT_REPO" "$INSTALL_DIR/deepgemm"
-
-echo "🏗️ Building DeepGEMM"
 pushd "$INSTALL_DIR/deepgemm"
 
 # Checkout the specific reference
 git checkout "$DEEPGEMM_GIT_REF"
 
-# Build DeepGEMM
+# Clean previous build artifacts
 # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh)
-rm -rf build dist
-rm -rf *.egg-info
+rm -rf build dist *.egg-info
+
+# Build wheel
+echo "🏗️ Building DeepGEMM wheel..."
 python3 setup.py bdist_wheel
 
-# Install the wheel
+# If --wheel-dir was specified, copy wheels there and exit
+if [ -n "$WHEEL_DIR" ]; then
+    mkdir -p "$WHEEL_DIR"
+    cp dist/*.whl "$WHEEL_DIR"/
+    echo "✅ Wheel built and copied to $WHEEL_DIR"
+    popd
+    exit 0
+fi
+
+# Default behaviour: install built wheel
 if command -v uv >/dev/null 2>&1; then
     echo "Installing DeepGEMM wheel using uv..."
-    # Use --system in Docker contexts, respect user's environment otherwise
     if [ -n "$VLLM_DOCKER_BUILD_CONTEXT" ]; then
         uv pip install --system dist/*.whl
     else
@@ -104,5 +121,4 @@ else
 fi
 
 popd
-
 echo "✅ DeepGEMM installation completed successfully"