[core] add nccl symmetric memory for all reduce (#24532)

Signed-off-by: Amir Samani <asamani@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Amir Samani 2025-09-23 11:33:06 -07:00 committed by GitHub
parent a3a7828010
commit 8c1c81a3de
12 changed files with 489 additions and 6 deletions

View File

@ -1039,3 +1039,4 @@ steps:
num_gpus: 2
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py

View File

@ -7,6 +7,10 @@ Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot).
For NCCL symmetric memory, set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1; otherwise
NCCL does not use the fast NVLS implementation for all reduce.
Usage:
torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
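
For example (assuming a single node with 4 GPUs), the symmetric-memory path
can be benchmarked with:

NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1 \
    torchrun --nproc_per_node=4 benchmark_device_communicators.py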
@ -26,7 +30,13 @@ import torch.distributed as dist
from torch.distributed import ProcessGroup
from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import (
    PyNcclCommunicator,
    register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id,
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
@ -98,6 +108,7 @@ class CommunicatorBenchmark:
)
if not self.pynccl_comm.disabled:
logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
register_nccl_symmetric_ops(self.pynccl_comm)
else:
logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
self.pynccl_comm = None
@ -194,6 +205,15 @@ class CommunicatorBenchmark:
None, # no env variable needed
)
)
communicators.append(
(
"pynccl-symm",
lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
lambda t: True, # Always available if initialized
nullcontext(),
None, # no env variable needed
)
)
if self.symm_mem_comm_multimem is not None:
comm = self.symm_mem_comm_multimem
@ -271,7 +291,9 @@ class CommunicatorBenchmark:
# Capture the graph using context manager
with context:
graph = torch.cuda.CUDAGraph()
graph_pool = torch.cuda.graph_pool_handle()
set_graph_pool_id(graph_pool)
with torch.cuda.graph(graph, pool=graph_pool):
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
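
The capture pattern above matters because symmetric buffers may be allocated
while a CUDA graph is being recorded: the allocator context needs to know the
graph pool id so it can pause allocation to that pool. A minimal sketch of the
pattern, assuming allreduce_fn and graph_input are set up as in the benchmark
(the helper name capture_allreduce is illustrative):

import torch
from vllm.distributed.device_communicators.pynccl_allocator import (
    set_graph_pool_id,
)

def capture_allreduce(allreduce_fn, graph_input, capture_cycles=5):
    # Create an explicit pool and publish its id so nccl_symm_mem_context
    # can pause/resume allocation to it during capture.
    graph_pool = torch.cuda.graph_pool_handle()
    set_graph_pool_id(graph_pool)
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, pool=graph_pool):
        for _ in range(capture_cycles):
            allreduce_fn(graph_input)
    return graph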

View File

@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import typing
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.device_communicators.cuda_communicator import (
CudaCommunicator)
from vllm.distributed.device_communicators.pynccl import (
register_nccl_symmetric_ops)
from vllm.distributed.device_communicators.pynccl_allocator import (
get_nccl_mem_pool, is_symmetric_memory_enabled)
from vllm.distributed.parallel_state import (get_tp_group,
init_distributed_environment,
initialize_model_parallel)
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
torch.manual_seed(42)
random.seed(44)
test_size_elements = 4 * 1024 * 1024
def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
monkeypatch = pytest.MonkeyPatch()
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
dtype = torch.bfloat16
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
update_environment_variables({
"RANK": str(local_rank),
"LOCAL_RANK": str(local_rank),
"WORLD_SIZE": str(world_size),
"MASTER_ADDR": "localhost",
"MASTER_PORT": "12345",
})
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
cuda_communicator = typing.cast(CudaCommunicator,
get_tp_group().device_communicator)
pynccl_comm = cuda_communicator.pynccl_comm
if get_nccl_mem_pool() is None:
pytest.skip("NCCL allocator compilation failed "
"(probably missing NCCL headers).")
if not is_symmetric_memory_enabled():
pytest.skip("NCCL symmetric memory allreduce is disabled.")
register_nccl_symmetric_ops(pynccl_comm)
input = torch.randint(1,
23, (test_size_elements, ),
dtype=dtype,
device=device)
input_clone = input.clone()
output = torch.ops.vllm.all_reduce_symmetric_with_copy(input)
assert output is not None
group = get_tp_group().device_group
dist.all_reduce(input_clone, group=group)
torch.testing.assert_close(output, input_clone, atol=2.5, rtol=0.1)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
)
@pytest.mark.parametrize("world_size", [2])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
reason="Only test on CUDA")
def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
# Enable SymmMemCommunicator
monkeypatch.setenv("VLLM_USE_NCCL_SYMM_MEM", "1")
monkeypatch.setenv("NCCL_NVLS_ENABLE", "1")
monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1")
mp.spawn(nccl_symm_mem_allreduce_worker,
args=(world_size, ),
nprocs=world_size)
cleanup_dist_env_and_memory()
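
With two GPUs available, the new test can be run on its own via the pipeline
entry added above:

pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py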

View File

@ -12,6 +12,8 @@ import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.compilation.monitor import validate_cudagraph_capturing_enabled
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.logger import init_logger
from vllm.platforms import current_platform
@ -154,6 +156,10 @@ class CUDAGraphWrapper:
stack.enter_context(
patch("torch.cuda.empty_cache", lambda: None))
if self.graph_pool is not None:
set_graph_pool_id(self.graph_pool)
else:
set_graph_pool_id(current_platform.graph_pool_handle())
# mind-exploding: carefully manage the reference and memory.
with torch.cuda.graph(cudagraph, pool=self.graph_pool):
# `output` is managed by pytorch's cudagraph pool

View File

@ -10,8 +10,9 @@ import sys
import tempfile
from collections.abc import Sequence
from itertools import product
from typing import Any, Optional
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
@ -56,6 +57,30 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
}
}
NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = {
"min_world_size": 4,
"thresholds": {
        4: 2 * MiB,  # 2 MiB
        8: 1 * MiB,  # 1 MiB
},
"always_use_above_world_size": 8 # Always use symm mem for world_size > 8
}
def should_nccl_symm_mem_allreduce(world_size: int,
input_tensor: torch.Tensor) -> bool:
from vllm.distributed.device_communicators.pynccl_allocator import (
is_symmetric_memory_enabled)
if not is_symmetric_memory_enabled():
return False
if world_size < NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["min_world_size"]:
return False
threshold = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["thresholds"].get(world_size)
if threshold is not None and input_tensor.nbytes >= threshold:
return True
return (world_size
> NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"])
def producer(batch_src: Sequence[int],
producer_queue,

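A quick sanity check of the heuristic above (a hedged sketch: it assumes the
helper is importable from all_reduce_utils, and it requires
VLLM_USE_NCCL_SYMM_MEM=1 plus a successfully compiled NCCL allocator,
otherwise is_symmetric_memory_enabled() short-circuits to False; tensor sizes
are illustrative):

import torch
from vllm.distributed.device_communicators.all_reduce_utils import (
    should_nccl_symm_mem_allreduce)

t_small = torch.empty(256 * 1024, dtype=torch.bfloat16)       # 0.5 MiB
t_large = torch.empty(2 * 1024 * 1024, dtype=torch.bfloat16)  # 4 MiB
should_nccl_symm_mem_allreduce(2, t_large)   # False: below min_world_size
should_nccl_symm_mem_allreduce(4, t_small)   # False: under the 2 MiB threshold
should_nccl_symm_mem_allreduce(4, t_large)   # True: >= 2 MiB at world_size 4
should_nccl_symm_mem_allreduce(16, t_small)  # True: world_size > 8 always qualifies
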
View File

@ -7,6 +7,12 @@ import torch
from torch.distributed import ProcessGroup
import vllm.envs as envs
from vllm.distributed.device_communicators.all_reduce_utils import (
should_nccl_symm_mem_allreduce)
from vllm.distributed.device_communicators.pynccl import (
register_nccl_symmetric_ops)
from vllm.distributed.device_communicators.pynccl_allocator import (
is_symmetric_memory_enabled)
from vllm.logger import init_logger
from vllm.platforms import current_platform
@ -53,6 +59,8 @@ class CudaCommunicator(DeviceCommunicatorBase):
group=self.cpu_group,
device=self.device,
)
if is_symmetric_memory_enabled():
register_nccl_symmetric_ops(self.pynccl_comm)
self.ca_comm: Optional[CustomAllreduce] = None
self.qr_comm: Optional[QuickAllReduce] = None
@ -107,6 +115,13 @@ class CudaCommunicator(DeviceCommunicatorBase):
raise ValueError(f"Unknown all2all backend: {all2all_backend}")
def all_reduce(self, input_):
    # since we currently copy input -> symm_input, run an out-of-place
    # all-reduce, and return symm_output, we do not need to check whether
    # the input is already symmetric memory
    if self.pynccl_comm is not None and \
            should_nccl_symm_mem_allreduce(self.pynccl_comm.world_size, input_):
out = torch.ops.vllm.all_reduce_symmetric_with_copy(input_)
if out is not None:
return out
# always try quick reduce first, then custom allreduce,
# and then pynccl. (quick reduce just for ROCM MI3*)
qr_comm = self.qr_comm

View File

@ -17,6 +17,39 @@ from vllm.utils import current_stream
logger = init_logger(__name__)
_NCCL_SYMM_OPS_REGISTERED = False
def register_nccl_symmetric_ops(pynccl_comm):
from vllm.distributed.device_communicators.pynccl_allocator import (
nccl_symm_mem_context)
from vllm.utils import direct_register_custom_op
global _NCCL_SYMM_OPS_REGISTERED
if _NCCL_SYMM_OPS_REGISTERED:
return
_NCCL_SYMM_OPS_REGISTERED = True
def all_reduce_symmetric_with_copy_impl(
input_tensor: torch.Tensor) -> torch.Tensor:
with nccl_symm_mem_context(pynccl_comm):
symm_input = torch.empty_like(input_tensor)
symm_output = torch.empty_like(input_tensor)
symm_input.copy_(input_tensor)
symm_output = pynccl_comm.all_reduce(symm_input, symm_output)
return symm_output
def all_reduce_symmetric_with_copy_fake(
input_tensor: torch.Tensor) -> torch.Tensor:
return torch.empty_like(input_tensor)
direct_register_custom_op(
op_name="all_reduce_symmetric_with_copy",
op_func=all_reduce_symmetric_with_copy_impl,
mutates_args=[],
fake_impl=all_reduce_symmetric_with_copy_fake,
)
class PyNcclCommunicator:
@ -67,6 +100,7 @@ class PyNcclCommunicator:
self.available = True
self.disabled = False
self.nccl_version = self.nccl.ncclGetRawVersion()
logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
if self.rank == 0:
@ -109,6 +143,7 @@ class PyNcclCommunicator:
def all_reduce(self,
in_tensor: torch.Tensor,
out_tensor: torch.Tensor = None,
op: ReduceOp = ReduceOp.SUM,
stream=None) -> torch.Tensor:
if self.disabled:
@ -120,7 +155,8 @@ class PyNcclCommunicator:
f"this nccl communicator is created to work on {self.device}, "
f"but the input tensor is on {in_tensor.device}")
        if out_tensor is None:
            out_tensor = torch.empty_like(in_tensor)
if stream is None:
stream = current_stream()
@ -288,3 +324,18 @@ class PyNcclCommunicator:
def group_end(self):
self.nccl.ncclGroupEnd()
def register_comm_window(self, tensor: torch.Tensor):
    return self.nccl.ncclCommWindowRegister(
        self.comm,
        buffer_type(tensor.data_ptr()),
        tensor.numel() * tensor.element_size(),
        1,  # winFlags: request a symmetric (collective) window
    )
def register_comm_window_raw(self, ptr: int, size: int):
return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr),
size, 1)
def deregister_comm_window(self, window):
return self.nccl.ncclCommWindowDeregister(self.comm, window)
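
Taken together, these additions expose the symmetric all-reduce as a torch
custom op. A minimal usage sketch (it assumes an initialized
PyNcclCommunicator, here named pynccl_comm, on every rank; the tensor shape is
illustrative):

import torch
from vllm.distributed.device_communicators.pynccl import (
    register_nccl_symmetric_ops)

register_nccl_symmetric_ops(pynccl_comm)  # idempotent, guarded by a module flag
x = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
# Copies x into a symmetric buffer, runs an out-of-place ncclAllReduce,
# and returns the symmetric output tensor.
y = torch.ops.vllm.all_reduce_symmetric_with_copy(x)

The fake impl, which just returns torch.empty_like(input_tensor), is what lets
torch.compile trace the op without touching NCCL.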

View File

@ -0,0 +1,186 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import atexit
import contextlib
import tempfile
from typing import Any, Optional
import torch
from packaging import version
from torch.cuda.memory import CUDAPluggableAllocator
from torch.utils.cpp_extension import load_inline
from vllm import envs
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import find_nccl_include_paths
logger = init_logger(__name__)
nccl_allocator_source = """
#include <nccl.h>
extern "C" {

void* nccl_alloc_plug(size_t size, int device, void* stream) {
  // Allocate NVLS-capable device memory through NCCL.
  void* ptr = nullptr;
  ncclResult_t err = ncclMemAlloc(&ptr, size);
  (void)err;  // on failure, ptr stays nullptr
  return ptr;
}

void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
  ncclResult_t err = ncclMemFree(ptr);
  (void)err;
}

}
"""
_allocator = None
_allocator_wrapper = None
_mem_pool = None
_registered_base_addrs = set()
_graph_pool_id = None
_nccl_allocator_failed_to_compile = False
_cached_pool_snapshot = None
def is_symmetric_memory_enabled():
global _nccl_allocator_failed_to_compile
return envs.VLLM_USE_NCCL_SYMM_MEM and not _nccl_allocator_failed_to_compile
def is_symmetric_memory_tensor(tensor: torch.Tensor):
if not is_symmetric_memory_enabled() or _cached_pool_snapshot is None:
return False
for segment in _cached_pool_snapshot:
for block in segment["blocks"]:
if block["address"] == tensor.untyped_storage().data_ptr():
return True
return False
def set_graph_pool_id(graph_pool_id):
global _graph_pool_id
_graph_pool_id = graph_pool_id
def compile_nccl_allocator():
global _allocator, _allocator_wrapper, _nccl_allocator_failed_to_compile
if not current_platform.is_cuda():
_nccl_allocator_failed_to_compile = True
return
try:
out_dir = tempfile.gettempdir()
nccl_allocator_libname = "nccl_allocator"
nccl_include_paths = find_nccl_include_paths()
load_inline(
name=nccl_allocator_libname,
cpp_sources=nccl_allocator_source,
with_cuda=True,
extra_ldflags=["-lnccl"],
verbose=envs.VLLM_LOGGING_LEVEL == "DEBUG",
is_python_module=False,
build_directory=out_dir,
extra_include_paths=nccl_include_paths,
)
_allocator_wrapper = CUDAPluggableAllocator(
f"{out_dir}/{nccl_allocator_libname}.so",
"nccl_alloc_plug",
"nccl_free_plug",
)
_allocator = _allocator_wrapper.allocator()
except Exception as e:
_nccl_allocator_failed_to_compile = True
        logger.warning(
            "Failed to compile NCCL memory allocator. "
            "Symmetric memory will be disabled. "
            "This is expected if NCCL headers are not available. "
            "Optionally set VLLM_NCCL_INCLUDE_PATH to point to a directory "
            "containing the NCCL headers. "
            "Error: %s", str(e))
def get_nccl_mem_pool():
global _mem_pool, _nccl_allocator_failed_to_compile
if _mem_pool is None and not _nccl_allocator_failed_to_compile:
compile_nccl_allocator()
if _allocator is not None:
_mem_pool = torch.cuda.MemPool(_allocator)
return _mem_pool
def _cleanup_nccl_mem_pool():
global _mem_pool
_mem_pool = None
def _cleanup_nccl_allocator_wrapper():
global _allocator_wrapper
_allocator_wrapper = None
atexit.register(_cleanup_nccl_mem_pool)
atexit.register(_cleanup_nccl_allocator_wrapper)
class nccl_symm_mem_context:
def __init__(
self,
pynccl_comm: PyNcclCommunicator,
disabled: bool = False,
):
self.disabled = (disabled or not is_symmetric_memory_enabled()
or pynccl_comm.world_size == 1
or not current_platform.is_cuda()
or get_nccl_mem_pool() is None or version.parse(
torch.__version__) < version.parse("2.8.0.a0"))
if self.disabled:
self.pynccl_comm: Optional[PyNcclCommunicator] = None
self._mem_pool_ctx: contextlib.AbstractContextManager[
Any] = contextlib.nullcontext()
self.is_graph_capture = None
self.device = None
else:
self.pynccl_comm = pynccl_comm
self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
self.is_graph_capture = torch.cuda.is_current_stream_capturing()
self.device = torch.cuda.current_device()
def __enter__(self):
if self.disabled:
return self
        assert (
            self.pynccl_comm
            is not None), "Symmetric memory requires pynccl to be initialized"
assert (
self.pynccl_comm.nccl_version >= 22703
), "NCCL version 2.27.3 or higher is required for NCCL symmetric memory"
if self.is_graph_capture:
assert (
_graph_pool_id
is not None), "graph_pool_id is not set under graph capture"
# Pause graph memory pool to use symmetric memory with cuda graph
torch._C._cuda_endAllocateToPool(self.device, _graph_pool_id)
self._mem_pool_ctx.__enter__()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if self.disabled:
return
global _cached_pool_snapshot
global _registered_base_addrs
self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
_pool = get_nccl_mem_pool()
assert _pool is not None
_cached_pool_snapshot = _pool.snapshot()
assert self.pynccl_comm is not None
for segment in _cached_pool_snapshot:
if segment["address"] not in _registered_base_addrs:
self.pynccl_comm.register_comm_window_raw(
segment["address"], segment["total_size"])
_registered_base_addrs.add(segment["address"])
if self.is_graph_capture:
torch._C._cuda_beginAllocateCurrentThreadToPool(
self.device, _graph_pool_id)
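
A minimal usage sketch of the context manager (it assumes an initialized
PyNcclCommunicator, NCCL >= 2.27.3, and PyTorch >= 2.8, and mirrors
all_reduce_symmetric_with_copy_impl above; input_tensor is illustrative):

with nccl_symm_mem_context(pynccl_comm):
    # These allocations go through ncclMemAlloc via the pluggable allocator.
    symm_input = torch.empty_like(input_tensor)
    symm_output = torch.empty_like(input_tensor)
    symm_input.copy_(input_tensor)
    symm_output = pynccl_comm.all_reduce(symm_input, symm_output)
# On __exit__, any new pool segments were window-registered with
# ncclCommWindowRegister, enabling the NVLS fast path on later calls.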

View File

@ -41,6 +41,7 @@ logger = init_logger(__name__)
ncclResult_t = ctypes.c_int
ncclComm_t = ctypes.c_void_p
ncclWindow_t = ctypes.c_void_p
class ncclUniqueId(ctypes.Structure):
@ -222,6 +223,24 @@ class NCCLLibrary:
Function("ncclGroupStart", ncclResult_t, []),
# ncclResult_t ncclGroupEnd();
Function("ncclGroupEnd", ncclResult_t, []),
# ncclResult_t ncclCommWindowRegister(
# ncclComm_t comm, void* buff, size_t size,
# ncclWindow_t* win, int winFlags);
Function(
"ncclCommWindowRegister",
ncclResult_t,
[
ncclComm_t,
buffer_type,
ctypes.c_size_t,
ctypes.POINTER(ncclWindow_t),
ctypes.c_int,
],
),
# ncclResult_t ncclCommWindowDeregister(
# ncclComm_t comm, ncclWindow_t win);
Function("ncclCommWindowDeregister", ncclResult_t,
[ncclComm_t, ncclWindow_t]),
]
# class attribute to store the mapping from the path to the library
@ -271,10 +290,14 @@ class NCCLLibrary:
error_str = self.ncclGetErrorString(result)
raise RuntimeError(f"NCCL error: {error_str}")
def ncclGetRawVersion(self) -> int:
    version = ctypes.c_int()
    self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
    # something like 21903
    return version.value

def ncclGetVersion(self) -> str:
    version_str = str(self.ncclGetRawVersion())
    # something like 21903 --> "2.19.3"
    major = version_str[0].lstrip("0")
    minor = version_str[1:3].lstrip("0")
@ -375,6 +398,17 @@ class NCCLLibrary:
def ncclGroupEnd(self) -> None:
self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())
def ncclCommWindowRegister(self, comm: ncclComm_t, buff: buffer_type,
size: int, win_flags: int) -> ncclWindow_t:
window = ncclWindow_t()
self.NCCL_CHECK(self._funcs["ncclCommWindowRegister"](
comm, buff, size, ctypes.byref(window), win_flags))
return window
def ncclCommWindowDeregister(self, comm: ncclComm_t,
window: ncclWindow_t) -> None:
self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
__all__ = [
"NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",

View File

@ -193,6 +193,8 @@ if TYPE_CHECKING:
VLLM_DBO_COMM_SMS: int = 20
GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = []
VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None
VLLM_USE_NCCL_SYMM_MEM: bool = False
VLLM_NCCL_INCLUDE_PATH: Optional[str] = None
def get_default_cache_root():
@ -1410,6 +1412,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
["container",
"code_interpreter",
"web_search_preview"]),
# Flag to enable NCCL symmetric memory allocation and registration
"VLLM_USE_NCCL_SYMM_MEM":
lambda: bool(int(os.getenv("VLLM_USE_NCCL_SYMM_MEM", "0"))),
# NCCL header path
"VLLM_NCCL_INCLUDE_PATH":
lambda: os.environ.get("VLLM_NCCL_INCLUDE_PATH", None),
}
# --8<-- [end:env-vars-definition]

View File

@ -1383,6 +1383,38 @@ def find_nccl_library() -> str:
return so_file
def find_nccl_include_paths() -> Optional[list[str]]:
"""
We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH`
environment variable, or we find the library file brought by
nvidia-nccl-cuXX. load_inline by default uses
torch.utils.cpp_extension.include_paths
"""
paths: list[str] = []
inc = envs.VLLM_NCCL_INCLUDE_PATH
if inc and os.path.isdir(inc):
paths.append(inc)
try:
import importlib.util
spec = importlib.util.find_spec("nvidia.nccl")
if spec and getattr(spec, "submodule_search_locations", None):
for loc in spec.submodule_search_locations:
inc_dir = os.path.join(loc, "include")
if os.path.exists(os.path.join(inc_dir, "nccl.h")):
paths.append(inc_dir)
except Exception:
pass
seen = set()
out: list[str] = []
for p in paths:
if p and p not in seen:
out.append(p)
seen.add(p)
return out or None
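
If header discovery fails (see the compile_nccl_allocator warning earlier in
this commit), the include directory can be supplied explicitly; the path here
is illustrative:

export VLLM_NCCL_INCLUDE_PATH=/usr/include  # directory containing nccl.h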
prev_set_stream = torch.cuda.set_stream
_current_stream_tls = threading.local()

View File

@ -11,6 +11,8 @@ import vllm.envs as envs
from vllm.compilation.cuda_graph import CUDAGraphWrapper
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.distributed import get_ep_group
from vllm.distributed.device_communicators.pynccl_allocator import (
set_graph_pool_id)
from vllm.forward_context import (create_forward_context, get_forward_context,
override_forward_context)
from vllm.logger import init_logger
@ -206,6 +208,10 @@ class UBatchWrapper:
cudagraph=torch.cuda.CUDAGraph(),
ubatch_metadata=ubatch_metadata,
)
if self.graph_pool is not None:
set_graph_pool_id(self.graph_pool)
else:
set_graph_pool_id(current_platform.graph_pool_handle())
with torch.cuda.graph(cudagraph_metadata.cudagraph,
stream=compute_stream,
pool=self.graph_pool):