diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 5e273af05dc5d..795b0c77d610e 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -11,6 +11,12 @@ Install the NIXL library: `uv pip install nixl`, as a quick start. - Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions - The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files +For non-CUDA platforms, install NIXL with a UCX build from source, as shown below. + +```bash +python tools/install_nixl_from_source_ubuntu.py +``` + ### Transport Configuration NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables: diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index e44a914c726db..93ed383395f27 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -191,7 +191,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok ### Setup Steps -1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. +1. 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# install_nixl_from_source_ubuntu.py
#
# --- Patch context carried over from the enclosing diff ---------------------
# The commented text below is NOT part of this script. It is the tail of the
# docs/serving/expert_parallel_deployment.md hunk and the header of the hunk
# that adds this file, reproduced verbatim so that no patch content is lost.
#
#   **Install gdrcopy/ucx/nixl**: For maximum performance, run the
#   [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install
#   `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8"
#   "x64"`). You can find available OS versions
#   [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
#   If `gdrcopy` is not installed, things will still work with a plain
#   `pip install nixl`, just with lower performance. `nixl` and `ucx` are
#   installed as dependencies via pip. For non-cuda platform to install nixl
#   with non-cuda UCX build, run the
#   [install_nixl_from_source_ubuntu.py](gh-file:tools/install_nixl_from_source_ubuntu.py)
#   script.
#    2. **Configure Both Instances**: Add this flag to both prefill and decode
#   instances `--kv-transfer-config
#   '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`. Noted, you may
#   also specify one or multiple NIXL_Backend. Such as: `--kv-transfer-config
#   '{"kv_connector":"NixlConnector","kv_role":"kv_both",
#   "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'`
#
#   diff --git a/tools/install_nixl_from_source_ubuntu.py
#              b/tools/install_nixl_from_source_ubuntu.py
#   new file mode 100644
#   index 0000000000000..9b5dfbb05a8af
#   --- /dev/null
#   +++ b/tools/install_nixl_from_source_ubuntu.py
#   @@ -0,0 +1,210 @@
# -----------------------------------------------------------------------------
"""Build UCX and NIXL from source and install NIXL as a self-contained wheel.

The script clones UCX (branch ``v1.19.x``) and NIXL, builds UCX into
``UCX_INSTALL_DIR``, builds a NIXL wheel against it, bundles the UCX shared
libraries into that wheel with ``auditwheel repair``, stores the repaired
wheel in ``WHEELS_CACHE_HOME`` and finally installs it with pip.
"""
import argparse
import glob
import os
import shutil
import subprocess
import sys

# --- Configuration ---
WHEELS_CACHE_HOME = os.environ.get("WHEELS_CACHE_HOME", "/tmp/wheels_cache")
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
UCX_DIR = os.path.join('/tmp', 'ucx_source')
NIXL_DIR = os.path.join('/tmp', 'nixl_source')
UCX_INSTALL_DIR = os.path.join('/tmp', 'ucx_install')
UCX_REPO_URL = 'https://github.com/openucx/ucx.git'
NIXL_REPO_URL = 'https://github.com/ai-dynamo/nixl.git'

# System packages needed by the UCX / NIXL builds and by auditwheel
# (patchelf). Used both for the apt-get install and for the non-root hint so
# the two can never drift apart.
APT_PACKAGES = (
    "patchelf",
    "build-essential",
    "git",
    "cmake",
    "ninja-build",
    "autotools-dev",
    "automake",
    "meson",
    "libtool",
    "libtool-bin",
)


# --- Helper Functions ---
def run_command(command, cwd='.', env=None):
    """Echo *command* and run it, raising on a non-zero exit status.

    Args:
        command: Argument list passed to ``subprocess.check_call`` (no shell).
        cwd: Working directory for the child process.
        env: Environment mapping for the child, or ``None`` to inherit.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    print(f"--> Running command: {' '.join(command)} in '{cwd}'", flush=True)
    subprocess.check_call(command, cwd=cwd, env=env)


def is_pip_package_installed(package_name):
    """Return True if ``pip show <package_name>`` exits with status 0.

    pip's output is discarded; a missing package yields False instead of an
    exception.
    """
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'show', package_name],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL)
    return result.returncode == 0


def find_nixl_wheel_in_cache(cache_dir):
    """Return the most recently modified ``nixl-*.whl`` in *cache_dir*.

    The repaired wheel carries a 'manylinux' tag, which the glob still
    matches. Selection is by modification time rather than by file name:
    a lexicographic sort would rank ``nixl-0.9.0`` above ``nixl-0.10.0`` and
    could hand back a stale wheel right after a newer one was built.

    Returns:
        The wheel path, or ``None`` if the directory holds no NIXL wheel
        (including when the directory does not exist).
    """
    wheels = glob.glob(os.path.join(cache_dir, "nixl-*.whl"))
    if not wheels:
        return None
    # The path is a tie-breaker so equal mtimes still give a stable answer.
    return max(wheels, key=lambda path: (os.path.getmtime(path), path))


def install_system_dependencies():
    """Install ``APT_PACKAGES`` with apt-get when running as root.

    Without root privileges nothing is installed; the required package list
    is printed instead so the user can install it manually.
    """
    if os.geteuid() != 0:
        print("\n---", flush=True)
        print(
            "WARNING: Not running as root. "
            "Skipping system dependency installation.",
            flush=True)
        print(
            "Please ensure the listed packages are installed on your system:",
            flush=True)
        print(f"  {' '.join(APT_PACKAGES)}", flush=True)
        print("---\n", flush=True)
        return

    print("--- Running as root. Installing system dependencies... ---",
          flush=True)
    run_command(['apt-get', 'update'])
    run_command(['apt-get', 'install', '-y', *APT_PACKAGES])
    print("--- System dependencies installed successfully. ---\n", flush=True)


def _build_ucx(ucx_install_path):
    """Clone (if needed), configure, build and install UCX."""
    if not os.path.exists(UCX_DIR):
        run_command(['git', 'clone', UCX_REPO_URL, UCX_DIR])
    ucx_source_path = os.path.abspath(UCX_DIR)
    run_command(['git', 'checkout', 'v1.19.x'], cwd=ucx_source_path)
    run_command(['./autogen.sh'], cwd=ucx_source_path)
    configure_command = [
        './configure',
        f'--prefix={ucx_install_path}',
        '--enable-shared',
        '--disable-static',
        '--disable-doxygen-doc',
        '--enable-optimizations',
        '--enable-cma',
        '--enable-devel-headers',
        '--with-verbs',
        '--enable-mt',
    ]
    run_command(configure_command, cwd=ucx_source_path)
    run_command(['make', '-j', str(os.cpu_count() or 1)], cwd=ucx_source_path)
    run_command(['make', 'install'], cwd=ucx_source_path)


def _make_build_env(ucx_install_path):
    """Return a copy of the environment that points builds at the new UCX."""
    build_env = os.environ.copy()
    ucx_lib_path = os.path.join(ucx_install_path, 'lib')
    ucx_plugin_path = os.path.join(ucx_lib_path, 'ucx')
    ucx_pkgconfig_path = os.path.join(ucx_lib_path, 'pkgconfig')

    # Prepend rather than overwrite, so pkg-config entries the caller already
    # relies on stay visible while the freshly built UCX takes precedence.
    existing_pkg_config = os.environ.get('PKG_CONFIG_PATH', '')
    build_env['PKG_CONFIG_PATH'] = \
        f"{ucx_pkgconfig_path}:{existing_pkg_config}".strip(':')

    existing_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
    build_env['LD_LIBRARY_PATH'] = \
        f"{ucx_lib_path}:{ucx_plugin_path}:{existing_ld_path}".strip(':')
    return build_env


def _build_repaired_wheel(build_env):
    """Build the NIXL wheel and repair it into ``WHEELS_CACHE_HOME``.

    Raises:
        RuntimeError: If pip produced no NIXL wheel.
    """
    temp_wheel_dir = os.path.join(ROOT_DIR, 'temp_wheelhouse')
    # A wheel left behind by an earlier, aborted run must not be mistaken for
    # the one built now.
    shutil.rmtree(temp_wheel_dir, ignore_errors=True)
    try:
        print("\n[2/3] Building NIXL wheel from source...", flush=True)
        if not os.path.exists(NIXL_DIR):
            run_command(['git', 'clone', NIXL_REPO_URL, NIXL_DIR])
        print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}",
              flush=True)
        run_command([
            sys.executable, '-m', 'pip', 'wheel', '.', '--no-deps',
            f'--wheel-dir={temp_wheel_dir}'
        ],
                    cwd=os.path.abspath(NIXL_DIR),
                    env=build_env)

        print("\n[3/3] Repairing NIXL wheel to include UCX libraries...",
              flush=True)
        unrepaired_wheel = find_nixl_wheel_in_cache(temp_wheel_dir)
        if not unrepaired_wheel:
            raise RuntimeError(
                "Failed to find the NIXL wheel after building it.")

        # libplugin_UCX.so is excluded because mesonpy already bundles it.
        run_command([
            'auditwheel',
            'repair',
            '--exclude',
            'libplugin_UCX.so',
            unrepaired_wheel,
            f'--wheel-dir={WHEELS_CACHE_HOME}',
        ],
                    env=build_env)
    finally:
        # The temp wheelhouse is the only scratch artifact; drop it even when
        # a build step failed.
        shutil.rmtree(temp_wheel_dir, ignore_errors=True)


def build_and_install_prerequisites(args):
    """Build UCX and NIXL from source and install a self-contained wheel.

    Unless ``args.force_reinstall`` is set, the function returns early when
    NIXL is already installed, and installs straight from a cached wheel in
    ``WHEELS_CACHE_HOME`` when one exists.

    Args:
        args: Parsed arguments exposing a boolean ``force_reinstall``.

    Raises:
        subprocess.CalledProcessError: If any build or install command fails.
        RuntimeError: If the built or repaired wheel cannot be found.
    """
    if not args.force_reinstall:
        if is_pip_package_installed('nixl'):
            print("--> NIXL is already installed. Nothing to do.", flush=True)
            return

        cached_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
        if cached_wheel:
            print(
                "\n--> Found self-contained wheel: "
                f"{os.path.basename(cached_wheel)}.",
                flush=True)
            print("--> Installing from cache, skipping all source builds.",
                  flush=True)
            run_command(
                [sys.executable, '-m', 'pip', 'install', cached_wheel])
            print("\n--- Installation from cache complete. ---", flush=True)
            return

    print(
        "\n--> No installed package or cached wheel found. "
        "Starting full build process...",
        flush=True)
    print("\n--> Installing auditwheel...", flush=True)
    run_command([sys.executable, '-m', 'pip', 'install', 'auditwheel'])
    install_system_dependencies()
    ucx_install_path = os.path.abspath(UCX_INSTALL_DIR)
    print(f"--> Using wheel cache directory: {WHEELS_CACHE_HOME}", flush=True)
    os.makedirs(WHEELS_CACHE_HOME, exist_ok=True)

    # -- Step 1: Build UCX from source --
    print("\n[1/3] Configuring and building UCX from source...", flush=True)
    _build_ucx(ucx_install_path)
    print("--- UCX build and install complete ---", flush=True)

    # -- Steps 2 and 3: Build the NIXL wheel, then bundle UCX into it --
    _build_repaired_wheel(_make_build_env(ucx_install_path))

    newly_built_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
    if not newly_built_wheel:
        raise RuntimeError("Failed to find the repaired NIXL wheel.")

    print(
        "--> Successfully built self-contained wheel: "
        f"{os.path.basename(newly_built_wheel)}. Now installing...",
        flush=True)
    install_command = [sys.executable, '-m', 'pip', 'install']
    if args.force_reinstall:
        install_command.append('--force-reinstall')
    install_command.append(newly_built_wheel)

    run_command(install_command)
    print("--- NIXL installation complete ---", flush=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Build and install UCX and NIXL dependencies.")
    parser.add_argument('--force-reinstall',
                        action='store_true',
                        help='Force rebuild and reinstall of UCX and NIXL '
                        'even if they are already installed.')
    args = parser.parse_args()
    build_and_install_prerequisites(args)