From e251e457c5848309ba62c1f772d4d7d218ee1dbc Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Mon, 13 Oct 2025 14:06:57 -0400
Subject: [PATCH] [Log] Optimize Startup Log (#26601)

Signed-off-by: yewentao256
---
 .../device_communicators/cuda_communicator.py   | 13 +++++++------
 vllm/distributed/device_communicators/pynccl.py |  3 +--
 vllm/utils/__init__.py                          |  2 +-
 vllm/v1/attention/backends/mla/cutlass_mla.py   |  2 +-
 vllm/v1/engine/core.py                          | 12 +++++++-----
 vllm/v1/worker/gpu_model_runner.py              |  1 -
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index c5c13debddb50..39b02311fe873 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -13,6 +13,7 @@ from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric
 from vllm.distributed.device_communicators.pynccl_allocator import (
     is_symmetric_memory_enabled,
 )
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
@@ -95,35 +96,35 @@ class CudaCommunicator(DeviceCommunicatorBase):
                 from .all2all import NaiveAll2AllManager
 
                 self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
-                logger.info("Using naive all2all manager.")
             elif all2all_backend == "allgather_reducescatter":
                 from .all2all import AgRsAll2AllManager
 
                 self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
-                logger.info("Using AllGather-ReduceScatter all2all manager.")
             elif all2all_backend == "pplx":
                 from .all2all import PPLXAll2AllManager
 
                 self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
-                logger.info("Using PPLX all2all manager.")
             elif all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager
 
                 self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
-                logger.info("Using DeepEP High-Throughput all2all manager.")
             elif all2all_backend == "deepep_low_latency":
                 from .all2all import DeepEPLLAll2AllManager
 
                 self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
-                logger.info("Using DeepEP Low-Latency all2all manager.")
             elif all2all_backend == "flashinfer_all2allv":
                 from .all2all import FlashInferAllToAllManager
 
                 self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
-                logger.info("Using Flashinfer all2allv manager.")
             else:
                 raise ValueError(f"Unknown all2all backend: {all2all_backend}")
 
+            if is_global_first_rank():
+                logger.info(
+                    "Using %s all2all manager.",
+                    self.all2all_manager.__class__.__name__,
+                )
+
     def all_reduce(self, input_):
         # since currently we perform copy input -> symm_input -> out-of-place AR
         # return symm_output, we don't need to check if input is symmetric
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 9b293d584a0a2..f083308791781 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -105,11 +105,10 @@ class PyNcclCommunicator:
         self.disabled = False
 
         self.nccl_version = self.nccl.ncclGetRawVersion()
-        logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
-
         if self.rank == 0:
             # get the unique id from NCCL
             self.unique_id = self.nccl.ncclGetUniqueId()
+            logger.info("vLLM is using nccl==%s", self.nccl.ncclGetVersion())
         else:
             # construct an empty unique id
             self.unique_id = ncclUniqueId()
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index c31c1ab0309c2..c8da83047a406 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1144,7 +1144,7 @@ def find_nccl_library() -> str:
         so_file = "librccl.so.1"
     else:
         raise ValueError("NCCL only supports CUDA and ROCm backends.")
-    logger.info("Found nccl from library %s", so_file)
+    logger.debug_once("Found nccl from library %s", so_file)
     return so_file
 
 
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index bd52de07d2739..c35e238eac4c0 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -139,7 +139,7 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         # FORCE_NUM_KV_SPLITS=1
         force_num_kv_splits = os.environ.get("FORCE_NUM_KV_SPLITS", None)
         if force_num_kv_splits:
-            logger.warning_once("Forcing num_kv_splits to %d", int(force_num_kv_splits))
+            logger.debug_once("Forcing num_kv_splits to %d", int(force_num_kv_splits))
             self._num_kv_splits = int(force_num_kv_splits)
         else:
             self._num_kv_splits = -1  # => Auto-detect
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 9aa4d459e2104..2c5d0fdc752ed 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -19,6 +19,7 @@ import zmq
 
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
@@ -91,11 +92,12 @@ class EngineCore:
         load_general_plugins()
 
         self.vllm_config = vllm_config
-        logger.info(
-            "Initializing a V1 LLM engine (v%s) with config: %s",
-            VLLM_VERSION,
-            vllm_config,
-        )
+        if is_global_first_rank():
+            logger.info(
+                "Initializing a V1 LLM engine (v%s) with config: %s",
+                VLLM_VERSION,
+                vllm_config,
+            )
 
         self.log_stats = log_stats
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 09e66a12d14f2..5c2893bd09266 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2876,7 +2876,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         with DeviceMemoryProfiler() as m:
             time_before_load = time.perf_counter()
             model_loader = get_model_loader(self.load_config)
-            logger.info("Loading model from scratch...")
             self.model = model_loader.load_model(
                 vllm_config=self.vllm_config, model_config=self.model_config
             )
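
Note (illustrative, not part of the patch): the recurring pattern above is to gate
once-per-deployment startup logs behind a global-first-rank check so that an N-rank
run prints each banner only once. Below is a minimal self-contained sketch of that
pattern using torch.distributed; the names first_rank_only and log_startup_banner
are hypothetical stand-ins, not vLLM's is_global_first_rank from
vllm.distributed.parallel_state, whose behavior may differ.

    # Sketch of "log only on the global first rank"; helper names are hypothetical.
    import logging

    import torch.distributed as dist

    logger = logging.getLogger(__name__)


    def first_rank_only() -> bool:
        # Treat single-process runs (no initialized process group) as the first rank.
        if not dist.is_available() or not dist.is_initialized():
            return True
        return dist.get_rank() == 0


    def log_startup_banner(version: str, config: object) -> None:
        # Other ranks stay silent, so the banner appears once per deployment.
        if first_rank_only():
            logger.info("Initializing engine (v%s) with config: %s", version, config)

The same guard can wrap any other rank-duplicated startup message, e.g. the
all2all manager line in cuda_communicator.py above.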