diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md
index 273e0f378e34..85e9d2a4f812 100644
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@@ -13,16 +13,16 @@ All scripts accept a positional argument as workspace path for staging the build
 
 ## Usage
 
-### Single-node
-
 ```bash
-bash install_python_libraries.sh
+# for hopper
+TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh
+# for blackwell
+TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
 ```
 
-### Multi-node
+Additional step for multi-node deployment:
 
 ```bash
-bash install_python_libraries.sh
 sudo bash configure_system_drivers.sh
 sudo reboot # Reboot is required to load the new driver
 ```
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 9d1b2da3b412..e163c83e8b51 100644
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -29,6 +29,12 @@ if [ -z "$CUDA_HOME" ]; then
     exit 1
 fi
 
+# assume TORCH_CUDA_ARCH_LIST is set correctly
+if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
+    echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
+    exit 1
+fi
+
 # disable all features except IBGDA
 export NVSHMEM_IBGDA_SUPPORT=1
 
@@ -95,7 +101,7 @@ clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py"
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
 popd
 
 # build and install deepep, require pytorch installed