From 30bab971c02f34971da93f9834f3789ff48a2511 Mon Sep 17 00:00:00 2001 From: ilmarkov Date: Wed, 26 Nov 2025 14:11:19 +0000 Subject: [PATCH] Edit config and fix config post_init Signed-off-by: ilmarkov --- vllm/config/parallel.py | 4 ++++ vllm/distributed/eplb/eplb_state.py | 10 ++++++++-- vllm/distributed/eplb/rebalance_execute.py | 6 +++--- vllm/engine/arg_utils.py | 8 ++++---- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7ba1da5db3849..44b89c3d24cbe 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -60,6 +60,10 @@ class EPLBConfig: Log the balancedness each step of expert parallelism. This is turned off by default since it will cause communication overhead. """ + log_balancedness_interval: int = 1 + """ + Log balancedness on steps that are a multiple of this value (min 1). + """ use_async: bool = False """ Whether to use non-blocking EPLB. diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index c768cc9a0593b..3ee421ed3d1cf 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -549,7 +549,12 @@ class EplbState: for eplb_model_state in self.model_states.values(): eplb_model_state.expert_load_pass.zero_() - if log_stats: + if ( + log_stats + and self.expert_rearrangement_step + % max(1, self.parallel_config.eplb_config.log_balancedness_interval) + == 0 + ): # Sync the expert load pass for each model (main and drafter). 
# expert_load_pass: (num_moe_layers, num_physical_experts) expert_load_pass_list = self._sync_load_pass() @@ -581,9 +586,10 @@ class EplbState: if ep_group.rank() == 0: logger.info( - "EPLB step: %d for model %s: avg_tokens=%.2f, " + "EPLB step: %d/%d for model %s: avg_tokens=%.2f, " "max_tokens=%d, balancedness=%.4f", self.expert_rearrangement_step, + self.expert_rearrangement_step_interval, eplb_model_state.model_name, avg_tokens, max_tokens, diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index aa9f77f3ca5c4..5bc111cf02756 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -528,9 +528,6 @@ def rearrange_expert_weights_inplace( # Max number of layers to group for communication max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS max_group_layers = max(min(max_group_layers, num_moe_layers), 1) - logger.info_once( - f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global" - ) first_layer_weights = list(expert_weights[0]) # Buffers to hold the expert weights during the exchange. @@ -552,6 +549,9 @@ def rearrange_expert_weights_inplace( group=ep_group, ) return + logger.info_once( + f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global" + ) # NOTE(bowen): We need this synchronize to run, but I don't know why. # If you figure out the reason, please let me know -- thank you! 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 696ff3a1f4024..8fbfcac7d2cd1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -419,10 +419,10 @@ class EngineArgs: ) _api_process_count: int = ParallelConfig._api_process_count _api_process_rank: int = ParallelConfig._api_process_rank - num_redundant_experts: int = EPLBConfig.num_redundant_experts - eplb_window_size: int = EPLBConfig.window_size - eplb_step_interval: int = EPLBConfig.step_interval - eplb_log_balancedness: bool = EPLBConfig.log_balancedness + num_redundant_experts: int | None = None + eplb_window_size: int | None = None + eplb_step_interval: int | None = None + eplb_log_balancedness: bool | None = None max_parallel_loading_workers: int | None = ( ParallelConfig.max_parallel_loading_workers )