[BugFix] Fix OOM in vLLM replicas by ensuring consistent NCCL memory accounting (#25359)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
Authored by kourosh hakhamaneshi on 2025-09-23 15:49:09 -07:00; committed by GitHub
parent 7361ab379f
commit abad204be6
3 changed files with 191 additions and 7 deletions
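
What changed, in short: `Worker.init_device()` used to take its baseline `MemorySnapshot` first and call `init_worker_distributed_environment()` afterwards, so the NCCL buffers allocated during distributed setup were invisible to the baseline that the worker's memory accounting is anchored to. This commit swaps the order so the snapshot already reflects those buffers, keeping the accounting consistent across replicas. The sketch below is not part of the commit; it is a minimal single-rank illustration (the port, the `world_size=1` setup, and the warm-up all_reduce are assumptions made for the demo) of how much free GPU memory a distributed-init step can consume, i.e. memory the old ordering still measured as available.

```python
# Minimal sketch (not part of the commit): measure free GPU memory before and
# after a stand-in for init_worker_distributed_environment(). The single-rank
# NCCL setup, the port, and the warm-up all_reduce are assumptions for this
# demo; absolute numbers will differ from a real multi-GPU worker.
import os

import torch
import torch.distributed as dist


def free_gib() -> float:
    free_bytes, _total = torch.cuda.mem_get_info()
    return free_bytes / 2**30


if __name__ == "__main__":
    torch.cuda.init()
    before = free_gib()

    # Stand-in for the worker's distributed setup: create the process group
    # and run a warm-up all_reduce so NCCL actually allocates its buffers.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=0, world_size=1)
    dist.all_reduce(torch.zeros(1, device="cuda"))
    torch.cuda.synchronize()

    after = free_gib()
    # With the old ordering, the baseline snapshot corresponded to `before`,
    # so the delta below was still being treated as available memory.
    print(f"free before distributed init: {before:.2f} GiB")
    print(f"free after  distributed init: {after:.2f} GiB")
    print(f"consumed during setup:        {before - after:.2f} GiB")

    dist.destroy_process_group()
```

On a real tensor-parallel worker the delta depends on GPU count, NCCL version, and environment settings such as `NCCL_BUFFSIZE`, which is exactly why it has to be measured rather than assumed.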


@@ -887,6 +887,8 @@ steps:
- tests/v1/test_external_lb_dp.py
- tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
- vllm/v1/worker/
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
@@ -908,6 +910,7 @@ steps:
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
- label: Plugin Tests (2 GPUs) # 40min
timeout_in_minutes: 60


@@ -0,0 +1,174 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing as mp
import os
import tempfile
from multiprocessing import Queue
from typing import Optional
from unittest.mock import patch

import pytest
import torch

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import MemorySnapshot
from vllm.v1.worker.gpu_worker import (Worker,
                                       init_worker_distributed_environment)

# Global queue to track operation order across processes
_QUEUE: Optional[Queue] = None


def track_operation(operation: str, rank: int):
    """Track when an operation happens and its rank."""
    if _QUEUE is not None:
        _QUEUE.put((operation, rank))


def make_operation_tracker(operation_name: str, original_func):
    """Create a mock function that tracks when an operation is called.

    Args:
        operation_name: Name to use when tracking this operation
        original_func: The original function to wrap

    Returns:
        A wrapper function that tracks the operation and calls the original
    """

    def wrapper(*args, **kwargs):
        rank = int(os.environ.get("RANK", "-1"))
        track_operation(operation_name, rank)
        return original_func(*args, **kwargs)

    return wrapper


def worker_process(rank: int, world_size: int, distributed_init_method: str,
queue: Queue, error_queue: Queue):
"""Worker process that initializes a GPU worker with proper tracking."""
global _QUEUE
_QUEUE = queue
try:
# Set environment variables
os.environ["RANK"] = str(rank)
os.environ["LOCAL_RANK"] = str(rank)
os.environ["WORLD_SIZE"] = str(world_size)
# Create vLLM config with small model
vllm_config = EngineArgs(model="facebook/opt-125m",
tensor_parallel_size=2,
load_format="dummy").create_engine_config()
# Create worker
worker = Worker(
vllm_config=vllm_config,
local_rank=rank,
rank=rank,
distributed_init_method=distributed_init_method,
)
# Get original functions before patching
original_init_worker = init_worker_distributed_environment
original_memory_snapshot_init = MemorySnapshot.__init__
original_all_reduce = torch.distributed.all_reduce
# Apply minimal patches to track operation order
init_patch = patch(
'vllm.v1.worker.gpu_worker.init_worker_distributed_environment',
side_effect=make_operation_tracker("init_distributed",
original_init_worker))
memory_patch = patch.object(
MemorySnapshot, '__init__',
make_operation_tracker("memory_snapshot",
original_memory_snapshot_init))
all_reduce_patch = patch('torch.distributed.all_reduce',
side_effect=make_operation_tracker(
"nccl_all_reduce", original_all_reduce))
with init_patch, memory_patch, all_reduce_patch:
# Initialize device (this is where we test the order)
worker.init_device()
# Load model to ensure everything works
worker.load_model()
# Signal success
queue.put(("success", rank))
except Exception as e:
error_queue.put((rank, str(e), type(e).__name__))
        raise


@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs for tensor parallelism")
def test_init_distributed_is_called_before_memory_snapshot():
"""Test that distributed env is setup before memory snapshot.
This test makes sure during worker initialization, the initial memory
snapshot is taken after distributed env is setup to include all the buffers
allocated by distributed env.
"""
world_size = 2
# Create a temporary file for distributed init
with tempfile.NamedTemporaryFile(delete=False) as f:
distributed_init_method = f"file://{f.name}"
# Create queues for inter-process communication
ctx = mp.get_context("spawn")
operation_queue = ctx.Queue()
error_queue = ctx.Queue()
# Start worker processes
processes = []
for rank in range(world_size):
p = ctx.Process(target=worker_process,
args=(rank, world_size, distributed_init_method,
operation_queue, error_queue))
p.start()
processes.append(p)
# Wait for all processes to complete
for p in processes:
p.join(timeout=60) # 60 second timeout
# Check for errors
errors = []
while not error_queue.empty():
rank, error_msg, error_type = error_queue.get()
errors.append(f"Rank {rank}: {error_type}: {error_msg}")
if errors:
pytest.fail("Worker processes failed:\n" + "\n".join(errors))
# Collect all operations from the queue
operations = []
while not operation_queue.empty():
operations.append(operation_queue.get())
# Verify we got operations from both ranks
print(f"Collected operations: {operations}")
# Check operations for each rank
for rank in range(world_size):
rank_ops = [op for op, r in operations if r == rank]
print(f"\nRank {rank} operations: {rank_ops}")
# Raises ValueError if the operation is not found
init_distributed = rank_ops.index("init_distributed")
nccl_all_reduce = rank_ops.index("nccl_all_reduce")
memory_snapshot = rank_ops.index("memory_snapshot")
# Verify order: init_distributed should happen before memory_snapshot
assert init_distributed < nccl_all_reduce < memory_snapshot, (
f"Rank {rank}: init_distributed (index {init_distributed}) "
f"must happen before nccl_all_reduce (index {nccl_all_reduce}) "
f"and memory_snapshot (index {memory_snapshot})")
# Clean up
os.unlink(distributed_init_method.replace("file://", ""))
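
The test above (`tests/v1/worker/test_worker_memory_snapshot.py`) boils down to one pattern: wrap the functions of interest so that each call records an `(operation, rank)` tuple on a multiprocessing queue, spawn the two workers, and assert on the recorded order. A stripped-down, GPU-free sketch of that pattern is shown below; `setup_distributed`, `take_snapshot`, and `init_device` are hypothetical stand-ins for the real vLLM functions, and the single-process version uses a plain list instead of the queue.

```python
# GPU-free sketch of the order-tracking pattern used by the test above.
# The three tracked functions are hypothetical stand-ins, not vLLM APIs.
from unittest.mock import patch


def setup_distributed():  # stand-in for init_worker_distributed_environment
    pass


def take_snapshot():  # stand-in for MemorySnapshot.__init__
    pass


def init_device():  # stand-in for Worker.init_device()
    setup_distributed()
    take_snapshot()


def make_tracker(name, original, log):
    """Wrap `original` so every call logs `name` before running it."""

    def wrapper(*args, **kwargs):
        log.append(name)
        return original(*args, **kwargs)

    return wrapper


if __name__ == "__main__":
    log = []
    with patch(f"{__name__}.setup_distributed",
               make_tracker("init_distributed", setup_distributed, log)), \
         patch(f"{__name__}.take_snapshot",
               make_tracker("memory_snapshot", take_snapshot, log)):
        init_device()

    # The real test does this per spawned rank and ships the entries back
    # through a multiprocessing.Queue before asserting on their order.
    assert log.index("init_distributed") < log.index("memory_snapshot")
    print("order OK:", log)
```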


@@ -169,6 +169,20 @@ class Worker(WorkerBase):
current_platform.set_device(self.device)
current_platform.check_if_supports_dtype(self.model_config.dtype)
        # Initialize the distributed environment BEFORE taking the memory
        # snapshot. This ensures the NCCL buffers are allocated before we
        # measure the available memory.
init_worker_distributed_environment(self.vllm_config, self.rank,
self.distributed_init_method,
self.local_rank,
current_platform.dist_backend)
# Set random seed.
set_random_seed(self.model_config.seed)
# Now take memory snapshot after NCCL is initialized
gc.collect()
torch.cuda.empty_cache()
@@ -190,13 +204,6 @@ class Worker(WorkerBase):
else:
raise RuntimeError(
f"Not support device type: {self.device_config.device}")
# Initialize the distributed environment.
init_worker_distributed_environment(self.vllm_config, self.rank,
self.distributed_init_method,
self.local_rank,
current_platform.dist_backend)
# Set random seed.
set_random_seed(self.model_config.seed)
# Construct the model runner
self.model_runner: GPUModelRunner = GPUModelRunner(
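
The comment in the hunk above is the crux: the snapshot has to come after the NCCL buffers exist, and those buffers live outside the PyTorch caching allocator, so they only show up in driver-level free memory (the kind of figure the baseline snapshot is built from), not in `torch.cuda.memory_reserved()`. It is also why the test insists that an all_reduce, and not just process-group creation, happens before the snapshot: by default PyTorch creates NCCL communicators lazily on the first collective. The single-rank sketch below (not from the commit; the port, `world_size=1`, and the warm-up all_reduce are assumptions) makes both points visible.

```python
# Single-rank sketch: compare driver-level free memory with the PyTorch
# caching allocator's view across a stand-in distributed init.
# Port and world_size=1 are assumptions for the demo.
import os

import torch
import torch.distributed as dist

GiB = 2**30


def report(tag: str) -> None:
    free, _total = torch.cuda.mem_get_info()   # what a memory snapshot sees
    reserved = torch.cuda.memory_reserved()    # what the torch allocator holds
    print(f"{tag:<28} driver-free={free / GiB:6.2f} GiB  "
          f"torch-reserved={reserved / GiB:6.2f} GiB")


if __name__ == "__main__":
    torch.cuda.init()
    report("baseline")

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group(backend="nccl", rank=0, world_size=1)
    report("after init_process_group")

    # First collective: this is typically where NCCL allocates its buffers.
    dist.all_reduce(torch.zeros(1, device="cuda"))
    torch.cuda.synchronize()
    report("after first all_reduce")

    dist.destroy_process_group()
```

Driver-level free memory drops across these steps while the allocator's reserved figure barely moves; the reordered `init_device()` closes that gap by measuring only after the drop has happened.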