diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index e559fdb397fa3..ea1963e5f2cd3 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -240,9 +240,9 @@ class CompilerManager:
         if graph_index == 0:
             # adds some info logging for the first graph
             if runtime_shape is None:
-                logger.info("Cache the graph for dynamic shape for later use")
+                logger.debug("Cache the graph for dynamic shape for later use")
             else:
-                logger.info(
+                logger.debug(
                     "Cache the graph of shape %s for later use", str(runtime_shape)
                 )
         if runtime_shape is None:
@@ -601,7 +601,7 @@ class VllmBackend:
         if disable_cache:
             logger.info("vLLM's torch.compile cache is disabled.")
         else:
-            logger.info(
+            logger.debug(
                 "Using cache directory: %s for vLLM's torch.compile", local_cache_dir
             )

@@ -615,7 +615,7 @@ class VllmBackend:
         from .monitor import torch_compile_start_time

         dynamo_time = time.time() - torch_compile_start_time
-        logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time)
+        logger.debug("Dynamo bytecode transform time: %.2f s", dynamo_time)
         self.compilation_config.compilation_time += dynamo_time

         # we control the compilation process, each instance can only be
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 4bc737494cb5b..d1ea3e6748b7f 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -34,7 +34,7 @@ def _can_p2p(rank: int, world_size: int) -> bool:
         if i == rank:
             continue
         if envs.VLLM_SKIP_P2P_CHECK:
-            logger.info("Skipping P2P check and trusting the driver's P2P report.")
+            logger.debug("Skipping P2P check and trusting the driver's P2P report.")
             return torch.cuda.can_device_access_peer(rank, i)
         if not gpu_p2p_access_check(rank, i):
             return False
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index cd201503bf17d..7bcfb8e207af1 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -299,7 +299,7 @@ class MessageQueue:
                 remote_addr_ipv6=remote_addr_ipv6,
             )

-        logger.info("vLLM message queue communication handle: %s", self.handle)
+        logger.debug("vLLM message queue communication handle: %s", self.handle)

     def export_handle(self) -> Handle:
         return self.handle
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 67a8c6f7c053f..45044f73d1ea7 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1157,7 +1157,7 @@ def init_distributed_environment(
         ip = parallel_config.data_parallel_master_ip
         port = parallel_config.get_next_dp_init_port()
         distributed_init_method = get_distributed_init_method(ip, port)
-        logger.info(
+        logger.debug(
             "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
             world_size,
             rank,
@@ -1322,7 +1322,7 @@ def initialize_model_parallel(
         group_ranks, get_world_group().local_rank, backend, group_name="ep"
     )

-    logger.info(
+    logger.debug(
         "rank %s in world size %s is assigned as "
         "DP rank %s, PP rank %s, TP rank %s, EP rank %s",
         rank,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 9b117f3b5d418..499c4573cacd5 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1137,7 +1137,7 @@ class FusedMoE(CustomOp):
             )
             self.local_num_experts = local_num_experts
             self.register_buffer("expert_map", expert_map)
-            logger.info_once(
+            logger.debug_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Expert "
                 "placement strategy: %s. Local/global"
                 " number of experts: %s/%s. Experts local to global index map:"
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index c97de1aa45964..04997923fe985 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -13,6 +13,7 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.torchao import torchao_version_at_least
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
@@ -311,10 +312,12 @@ class DefaultModelLoader(BaseModelLoader):
             loaded_weights = load_weights_and_online_quantize(self, model, model_config)

         self.counter_after_loading_weights = time.perf_counter()
-        logger.info(
-            "Loading weights took %.2f seconds",
-            self.counter_after_loading_weights - self.counter_before_loading_weights,
-        )
+        if is_global_first_rank():
+            logger.info(
+                "Loading weights took %.2f seconds",
+                self.counter_after_loading_weights
+                - self.counter_before_loading_weights,
+            )
         # We only enable strict check for non-quantized models
         # that have loaded weights tracking currently.
         if model_config.quantization is None and loaded_weights is not None:
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index c2d68029f4c71..341c2960f51db 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -416,7 +416,7 @@ def download_weights_from_hf(
                 e,
             )

-    logger.info("Using model weights format %s", allow_patterns)
+    logger.debug("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
     # downloading the same model weights at the same time.
     with get_lock(model_name_or_path, cache_dir):
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index d63ef78f5b2d2..7daeb08e25557 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -221,10 +221,12 @@ def resolve_current_platform_cls_qualname() -> str:
         )
     elif len(activated_builtin_plugins) == 1:
         platform_cls_qualname = builtin_platform_plugins[activated_builtin_plugins[0]]()
-        logger.info("Automatically detected platform %s.", activated_builtin_plugins[0])
+        logger.debug(
+            "Automatically detected platform %s.", activated_builtin_plugins[0]
+        )
     else:
         platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
-        logger.info("No platform detected, vLLM is running on UnspecifiedPlatform")
+        logger.debug("No platform detected, vLLM is running on UnspecifiedPlatform")

     return platform_cls_qualname

diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py
index 99c19c9db28e9..10527f79eeaff 100644
--- a/vllm/utils/gc_utils.py
+++ b/vllm/utils/gc_utils.py
@@ -37,7 +37,7 @@ class GCDebugConfig:
         except Exception:
             self.enabled = False
             logger.error("Failed to parse VLLM_GC_DEBUG(%s)", VLLM_GC_DEBUG)
-        logger.info("GC Debug Config. %s", str(self))
+        logger.debug("GC Debug Config. %s", str(self))

     def __repr__(self) -> str:
         return f"enabled:{self.enabled},top_objects:{self.top_objects}"
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 2c5d0fdc752ed..23a3fd50d2312 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -19,7 +19,6 @@ import zmq

 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
-from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception
 from vllm.lora.request import LoRARequest
@@ -92,7 +91,7 @@ class EngineCore:
         load_general_plugins()

         self.vllm_config = vllm_config
-        if is_global_first_rank():
+        if vllm_config.parallel_config.data_parallel_rank == 0:
             logger.info(
                 "Initializing a V1 LLM engine (v%s) with config: %s",
                 VLLM_VERSION,
@@ -726,7 +725,6 @@ class EngineCoreProc(EngineCore):
         )

         # Receive initialization message.
-        logger.info("Waiting for init message from front-end.")
         if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000):
             raise RuntimeError(
                 "Did not receive response from front-end "
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 8c5abae2ae652..2bf7492a36c51 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -170,7 +170,7 @@ class LoggingStatLogger(StatLoggerBase):

     def log_engine_initialized(self):
         if self.vllm_config.cache_config.num_gpu_blocks:
-            logger.info(
+            logger.debug(
                 "Engine %03d: vllm cache_config_info with initialization "
                 "after num_gpu_blocks is: %d",
                 self.engine_index,
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index ed8bc55a3cf2f..4e8d8221eb5ab 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -8,6 +8,7 @@ from packaging import version

 from vllm import envs
 from vllm.config.model import LogprobsMode
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform

@@ -55,7 +56,8 @@ class TopKTopPSampler(nn.Module):
                 # None means False, while in V1, None means True. This is
                 # why we use the condition
                 # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
-                logger.info_once("Using FlashInfer for top-p & top-k sampling.")
+                if is_global_first_rank():
+                    logger.info_once("Using FlashInfer for top-p & top-k sampling.")
                 self.forward = self.forward_cuda
             else:
                 logger.warning_once(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5c2893bd09266..f05e59fb65c41 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2846,7 +2846,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         Args:
             eep_scale_up: the model loading is for elastic EP scale up.
         """
-        logger.info("Starting to load model %s...", self.model_config.model)
+        if is_global_first_rank():
+            logger.info_once("Starting to load model %s...", self.model_config.model)
         if eep_scale_up:
             from vllm.distributed.parallel_state import get_ep_group

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0e9ab3f9148b9..5ea4e4d909c15 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -20,7 +20,10 @@ from vllm.distributed import (
     set_custom_all_reduce,
 )
 from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
-from vllm.distributed.parallel_state import get_pp_group, get_tp_group
+from vllm.distributed.parallel_state import (
+    get_pp_group,
+    get_tp_group,
+)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed