From 8d6a89dffd9e2e238b3c6ead964783cfdb053756 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Tue, 25 Nov 2025 20:19:35 -0500
Subject: [PATCH] [UX] Suppress gloo log spam (#29250)

Signed-off-by: mgoin
---
 vllm/distributed/parallel_state.py |  4 ++-
 vllm/distributed/utils.py          | 52 ++++++++++++++++--------------
 vllm/utils/system_utils.py         | 33 +++++++++++++++++++
 3 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index f81612fd1f4a3..69c28e278f2d2 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -51,6 +51,7 @@ from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
 from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.utils.network_utils import get_distributed_init_method
+from vllm.utils.system_utils import suppress_stdout
 from vllm.utils.torch_utils import (
     direct_register_custom_op,
     supports_custom_op,
@@ -329,7 +330,8 @@ class GroupCoordinator:
         )
         # a group with `gloo` backend, to allow direct coordination between
         # processes through the CPU.
-        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+        with suppress_stdout():
+            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
         if self.rank in ranks:
             self.ranks = ranks
             self.world_size = len(ranks)
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index debf69c49b7d9..242ce393e4dc8 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -30,6 +30,7 @@ from torch.distributed.rendezvous import rendezvous
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_tcp_uri
+from vllm.utils.system_utils import suppress_stdout
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 
 logger = init_logger(__name__)
@@ -427,33 +428,34 @@ def init_gloo_process_group(
     Stateless init ProcessGroup with gloo backend compatible with
     different torch versions.
     """
-    if is_torch_equal_or_newer("2.6"):
-        pg = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-        )
-    else:
-        options = ProcessGroup.Options(backend="gloo")
-        pg = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-            options,
-        )
-    from torch.distributed.distributed_c10d import ProcessGroupGloo
+    with suppress_stdout():
+        if is_torch_equal_or_newer("2.6"):
+            pg = ProcessGroup(
+                prefix_store,
+                group_rank,
+                group_size,
+            )
+        else:
+            options = ProcessGroup.Options(backend="gloo")
+            pg = ProcessGroup(
+                prefix_store,
+                group_rank,
+                group_size,
+                options,
+            )
+        from torch.distributed.distributed_c10d import ProcessGroupGloo
 
-    backend_class = ProcessGroupGloo(
-        prefix_store, group_rank, group_size, timeout=timeout
-    )
-    backend_type = ProcessGroup.BackendType.GLOO
-    device = torch.device("cpu")
-    if is_torch_equal_or_newer("2.6"):
-        # _set_default_backend is supported in torch >= 2.6
-        pg._set_default_backend(backend_type)
-    backend_class._set_sequence_number_for_group()
+        backend_class = ProcessGroupGloo(
+            prefix_store, group_rank, group_size, timeout=timeout
+        )
+        backend_type = ProcessGroup.BackendType.GLOO
+        device = torch.device("cpu")
+        if is_torch_equal_or_newer("2.6"):
+            # _set_default_backend is supported in torch >= 2.6
+            pg._set_default_backend(backend_type)
+        backend_class._set_sequence_number_for_group()
 
-    pg._register_backend(device, backend_type, backend_class)
+        pg._register_backend(device, backend_type, backend_class)
 
     return pg
 
diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py
index cc872040b6c5f..a4eb8f4d4fd7d 100644
--- a/vllm/utils/system_utils.py
+++ b/vllm/utils/system_utils.py
@@ -56,6 +56,39 @@ def set_env_var(key: str, value: str) -> Iterator[None]:
         os.environ[key] = old
 
 
+@contextlib.contextmanager
+def suppress_stdout():
+    """
+    Suppress stdout from C libraries at the file descriptor level.
+
+    Only suppresses stdout, not stderr, to preserve error messages.
+    Suppression is disabled when VLLM_LOGGING_LEVEL is set to DEBUG.
+
+    Example:
+        with suppress_stdout():
+            # C library calls that would normally print to stdout
+            torch.distributed.new_group(ranks, backend="gloo")
+    """
+    # Don't suppress if logging level is DEBUG
+    if envs.VLLM_LOGGING_LEVEL == "DEBUG":
+        yield
+        return
+
+    stdout_fd = sys.stdout.fileno()
+    stdout_dup = os.dup(stdout_fd)
+    devnull_fd = os.open(os.devnull, os.O_WRONLY)
+
+    try:
+        sys.stdout.flush()
+        os.dup2(devnull_fd, stdout_fd)
+        yield
+    finally:
+        sys.stdout.flush()
+        os.dup2(stdout_dup, stdout_fd)
+        os.close(stdout_dup)
+        os.close(devnull_fd)
+
+
+# File path utilities