Mirror of https://git.datalinker.icu/vllm-project/vllm.git
Edit config and fix config post_init

Signed-off-by: ilmarkov <markovilya197@gmail.com>

parent b8533148ed
commit 30bab971c0
@@ -60,6 +60,10 @@ class EPLBConfig:
     Log the balancedness each step of expert parallelism.
     This is turned off by default since it will cause communication overhead.
     """
+    log_balancedness_interval: int = 1
+    """
+    Interval for logging the balancedness.
+    """
     use_async: bool = False
     """
     Whether to use non-blocking EPLB.
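The new interval knob composes with the existing log_balancedness flag. A minimal sketch of the resulting config surface, assuming the dataclass decorator and with the remaining EPLBConfig fields omitted:

    # Sketch only: field names and defaults are taken from the diff.
    from dataclasses import dataclass

    @dataclass
    class EPLBConfig:
        log_balancedness: bool = False        # off by default (communication overhead)
        log_balancedness_interval: int = 1    # new: log every N rearrangement steps
        use_async: bool = False               # whether to use non-blocking EPLB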
@@ -549,7 +549,12 @@ class EplbState:
         for eplb_model_state in self.model_states.values():
             eplb_model_state.expert_load_pass.zero_()
 
-        if log_stats:
+        if (
+            log_stats
+            and self.expert_rearrangement_step
+            % self.parallel_config.eplb_config.log_balancedness_interval
+            == 0
+        ):
             # Sync the expert load pass for each model (main and drafter).
             # expert_load_pass: (num_moe_layers, num_physical_experts)
             expert_load_pass_list = self._sync_load_pass()
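The rewritten condition gates stats logging on the new interval. A hypothetical standalone illustration of the same modulo check (the helper name is made up):

    # Stats are emitted only when logging is enabled and the current
    # rearrangement step is a multiple of the configured interval.
    def should_log(step: int, interval: int, log_stats: bool) -> bool:
        return log_stats and step % interval == 0

    assert [s for s in range(6) if should_log(s, 3, True)] == [0, 3]
    assert not any(should_log(s, 3, False) for s in range(6))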
@@ -581,9 +586,10 @@ class EplbState:
 
             if ep_group.rank() == 0:
                 logger.info(
-                    "EPLB step: %d for model %s: avg_tokens=%.2f, "
+                    "EPLB step: %d/%d for model %s: avg_tokens=%.2f, "
                     "max_tokens=%d, balancedness=%.4f",
                     self.expert_rearrangement_step,
+                    self.expert_rearrangement_step_interval,
                     eplb_model_state.model_name,
                     avg_tokens,
                     max_tokens,
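The extra %d reports the current step against expert_rearrangement_step_interval. A hypothetical rendering with made-up values:

    msg = (
        "EPLB step: %d/%d for model %s: avg_tokens=%.2f, "
        "max_tokens=%d, balancedness=%.4f"
    ) % (12, 32, "main", 4096.0, 5120, 0.8)
    # -> 'EPLB step: 12/32 for model main: avg_tokens=4096.00,
    #     max_tokens=5120, balancedness=0.8000'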
@@ -528,9 +528,6 @@ def rearrange_expert_weights_inplace(
     # Max number of layers to group for communication
     max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS
     max_group_layers = max(min(max_group_layers, num_moe_layers), 1)
-    logger.info_once(
-        f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
-    )
 
     first_layer_weights = list(expert_weights[0])
     # Buffers to hold the expert weights during the exchange.
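The clamp bounds the env-provided group size before it is used. A small self-contained sketch of that arithmetic (the helper name is hypothetical):

    # Bound the requested group size to [1, num_moe_layers]: grouping can
    # never span more layers than the model has, and never fewer than one.
    def clamp_group_layers(requested: int, num_moe_layers: int) -> int:
        return max(min(requested, num_moe_layers), 1)

    assert clamp_group_layers(8, 4) == 4  # capped at the MoE layer count
    assert clamp_group_layers(0, 4) == 1  # floor of one layer per group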
@@ -552,6 +549,9 @@ def rearrange_expert_weights_inplace(
             group=ep_group,
         )
         return
+    logger.info_once(
+        f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
+    )
 
     # NOTE(bowen): We need this synchronize to run, but I don't know why.
     # If you figure out the reason, please let me know -- thank you!
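Read together with the previous hunk, the info_once call moves from before the early-return branch to after it, so the grouping message is only emitted when the grouped rearrangement path actually runs. A control-flow sketch under that reading (the function body here is illustrative, not the real one):

    def rearrange_sketch(single_shot: bool, max_group_layers: int) -> None:
        if single_shot:
            ...      # exchange all weights in one pass, then
            return   # return early: the grouping log is skipped
        # only the grouped path reaches the relocated log statement
        print(f"EPLB Sync: rearrange max_group_layers: {max_group_layers}")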
@@ -419,10 +419,10 @@ class EngineArgs:
     )
     _api_process_count: int = ParallelConfig._api_process_count
     _api_process_rank: int = ParallelConfig._api_process_rank
-    num_redundant_experts: int = EPLBConfig.num_redundant_experts
-    eplb_window_size: int = EPLBConfig.window_size
-    eplb_step_interval: int = EPLBConfig.step_interval
-    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
+    num_redundant_experts: int | None = None
+    eplb_window_size: int | None = None
+    eplb_step_interval: int | None = None
+    eplb_log_balancedness: bool | None = None
     max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
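Switching the EngineArgs mirrors of the EPLB fields to None defaults lets a later resolution step distinguish "never set on the CLI" from "explicitly set to the old default". A minimal sketch of that pattern, assuming the fallback lives in a __post_init__-style hook (the commit message mentions fixing the config post_init; class names here are stand-ins):

    from dataclasses import dataclass

    @dataclass
    class EPLBDefaults:  # stand-in for EPLBConfig's defaults
        num_redundant_experts: int = 0
        log_balancedness: bool = False

    @dataclass
    class Args:  # stand-in for EngineArgs
        num_redundant_experts: int | None = None
        eplb_log_balancedness: bool | None = None

        def __post_init__(self) -> None:
            # Only values the user never supplied fall back to the
            # EPLB config defaults; explicit values are preserved.
            d = EPLBDefaults()
            if self.num_redundant_experts is None:
                self.num_redundant_experts = d.num_redundant_experts
            if self.eplb_log_balancedness is None:
                self.eplb_log_balancedness = d.log_balancedness

    args = Args(num_redundant_experts=4)
    assert args.num_redundant_experts == 4      # explicit value kept
    assert args.eplb_log_balancedness is False  # unset -> default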