From b706d898af7c55dc854858bace3c9041cf22da66 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Tue, 11 Mar 2025 16:40:07 -0700
Subject: [PATCH] [Bugfix][V1][PP] Only warmup sampler at last PP rank (#14643)

Signed-off-by: Cody Yu
---
 vllm/v1/worker/gpu_worker.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 040a27de9480a..5527a105f8670 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -14,6 +14,7 @@ from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
@@ -219,20 +220,22 @@ class Worker(WorkerBase):
         # fragmentation issue.
         # NOTE: This is called after `capture_model` on purpose to prevent
         # memory buffers from being cleared by `torch.cuda.empty_cache`.
-        try:
-            max_num_reqs = min(self.scheduler_config.max_num_seqs,
-                               self.scheduler_config.max_num_batched_tokens)
-            self.model_runner._dummy_sampler_run(
-                hidden_states=self.model_runner._dummy_run(
-                    num_tokens=max_num_reqs))
-        except RuntimeError as e:
-            if 'out of memory' in str(e):
-                raise RuntimeError(
-                    "CUDA out of memory occurred when warming up sampler. "
-                    "Please try lowering `gpu_memory_utilization` when "
-                    "initializing the engine.") from None
-            else:
-                raise e
+        if get_pp_group().is_last_rank:
+            try:
+                max_num_reqs = min(
+                    self.scheduler_config.max_num_seqs,
+                    self.scheduler_config.max_num_batched_tokens)
+                self.model_runner._dummy_sampler_run(
+                    hidden_states=self.model_runner._dummy_run(
+                        num_tokens=max_num_reqs))
+            except RuntimeError as e:
+                if 'out of memory' in str(e):
+                    raise RuntimeError(
+                        "CUDA out of memory occurred when warming up sampler. "
+                        "Please try lowering `gpu_memory_utilization` when "
+                        "initializing the engine.") from None
+                else:
+                    raise e
 
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
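
Note on the change: under pipeline parallelism only the last PP rank holds
the LM head and sampler, so the dummy sampler warmup only makes sense there;
intermediate ranks' `_dummy_run` produces hidden states to hand to the next
stage, not logits to sample from. Below is a minimal standalone sketch of
the gating pattern, where `PPGroup` and `warm_up_sampler` are hypothetical
stand-ins for vLLM's `get_pp_group()` and the worker's warmup step; only the
`is_last_rank` property mirrors the API the diff actually uses:

    # Illustrative sketch only: PPGroup and warm_up_sampler are hypothetical
    # stand-ins, not vLLM's real classes; only `is_last_rank` mirrors the
    # attribute used in the patch.
    from dataclasses import dataclass

    @dataclass
    class PPGroup:
        rank: int        # this worker's pipeline stage index
        world_size: int  # total number of pipeline stages

        @property
        def is_last_rank(self) -> bool:
            # Only the final stage owns the LM head, so only it can sample.
            return self.rank == self.world_size - 1

    def warm_up_sampler(pp_group: PPGroup) -> None:
        if not pp_group.is_last_rank:
            # Intermediate stages emit hidden states for the next stage,
            # not logits, so there is no sampler to warm up: skip, as the
            # patch now does.
            return
        try:
            # In vLLM this step is model_runner._dummy_sampler_run(
            #     hidden_states=model_runner._dummy_run(num_tokens=...)).
            pass
        except RuntimeError as e:
            if "out of memory" in str(e):
                raise RuntimeError(
                    "CUDA out of memory occurred when warming up sampler. "
                    "Please try lowering `gpu_memory_utilization`.") from None
            raise

    for rank in range(4):
        group = PPGroup(rank=rank, world_size=4)
        print(f"rank {rank}: runs sampler warmup = {group.is_last_rank}")

With a PP size of 4, only rank 3 reports True: the other ranks return before
the dummy sampler pass, which before this patch ran (and could hit the OOM
path) on every rank.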