From b706d898af7c55dc854858bace3c9041cf22da66 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Tue, 11 Mar 2025 16:40:07 -0700
Subject: [PATCH] [Bugfix][V1][PP] Only warmup sampler at last PP rank (#14643)

Signed-off-by: Cody Yu
---
 vllm/v1/worker/gpu_worker.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 040a27de9480a..5527a105f8670 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -14,6 +14,7 @@ from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
@@ -219,20 +220,22 @@ class Worker(WorkerBase):
         # fragmentation issue.
         # NOTE: This is called after `capture_model` on purpose to prevent
         # memory buffers from being cleared by `torch.cuda.empty_cache`.
-        try:
-            max_num_reqs = min(self.scheduler_config.max_num_seqs,
-                               self.scheduler_config.max_num_batched_tokens)
-            self.model_runner._dummy_sampler_run(
-                hidden_states=self.model_runner._dummy_run(
-                    num_tokens=max_num_reqs))
-        except RuntimeError as e:
-            if 'out of memory' in str(e):
-                raise RuntimeError(
-                    "CUDA out of memory occurred when warming up sampler. "
-                    "Please try lowering `gpu_memory_utilization` when "
-                    "initializing the engine.") from None
-            else:
-                raise e
+        if get_pp_group().is_last_rank:
+            try:
+                max_num_reqs = min(
+                    self.scheduler_config.max_num_seqs,
+                    self.scheduler_config.max_num_batched_tokens)
+                self.model_runner._dummy_sampler_run(
+                    hidden_states=self.model_runner._dummy_run(
+                        num_tokens=max_num_reqs))
+            except RuntimeError as e:
+                if 'out of memory' in str(e):
+                    raise RuntimeError(
+                        "CUDA out of memory occurred when warming up sampler. "
+                        "Please try lowering `gpu_memory_utilization` when "
+                        "initializing the engine.") from None
+                else:
+                    raise e
 
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
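
Note on the change: under pipeline parallelism only the last PP rank holds
the LM head and sampler, so the dummy sampler warmup only makes sense there;
intermediate ranks' `_dummy_run` produces hidden states to hand to the next
stage, not logits to sample from. Below is a minimal standalone sketch of
the gating pattern, where `PPGroup` and `warm_up_sampler` are hypothetical
stand-ins for vLLM's `get_pp_group()` and the worker's warmup step; only the
`is_last_rank` property mirrors the API the diff actually uses:

    # Illustrative sketch only: PPGroup and warm_up_sampler are hypothetical
    # stand-ins, not vLLM's real classes; only `is_last_rank` mirrors the
    # attribute used in the patch.
    from dataclasses import dataclass

    @dataclass
    class PPGroup:
        rank: int        # this worker's pipeline stage index
        world_size: int  # total number of pipeline stages

        @property
        def is_last_rank(self) -> bool:
            # Only the final stage owns the LM head, so only it can sample.
            return self.rank == self.world_size - 1

    def warm_up_sampler(pp_group: PPGroup) -> None:
        if not pp_group.is_last_rank:
            # Intermediate stages emit hidden states for the next stage,
            # not logits, so there is no sampler to warm up: skip, as the
            # patch now does.
            return
        try:
            # In vLLM this step is model_runner._dummy_sampler_run(
            #     hidden_states=model_runner._dummy_run(num_tokens=...)).
            pass
        except RuntimeError as e:
            if "out of memory" in str(e):
                raise RuntimeError(
                    "CUDA out of memory occurred when warming up sampler. "
                    "Please try lowering `gpu_memory_utilization`.") from None
            raise

    for rank in range(4):
        group = PPGroup(rank=rank, world_size=4)
        print(f"rank {rank}: runs sampler warmup = {group.is_last_rank}")

With a PP size of 4, only rank 3 reports True: the other ranks return before
the dummy sampler pass, which before this patch ran (and could hit the OOM
path) on every rank.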