Edit config and fix config post_init

Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
ilmarkov 2025-11-26 14:11:19 +00:00
parent b8533148ed
commit 30bab971c0
4 changed files with 19 additions and 9 deletions

View File

@ -60,6 +60,10 @@ class EPLBConfig:
Log the balancedness each step of expert parallelism. Log the balancedness each step of expert parallelism.
This is turned off by default since it will cause communication overhead. This is turned off by default since it will cause communication overhead.
""" """
log_balancedness_interval: int = 1
"""
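Interval, in EPLB steps, between balancedness log entries. Balancedness is
logged only on steps whose index is a multiple of this value, so it must be
at least 1.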
"""
use_async: bool = False use_async: bool = False
""" """
Whether to use non-blocking EPLB. Whether to use non-blocking EPLB.

View File

@ -549,7 +549,12 @@ class EplbState:
for eplb_model_state in self.model_states.values(): for eplb_model_state in self.model_states.values():
eplb_model_state.expert_load_pass.zero_() eplb_model_state.expert_load_pass.zero_()
if log_stats: if (
log_stats
and self.expert_rearrangement_step
% self.parallel_config.eplb_config.log_balancedness_interval
== 0
):
# Sync the expert load pass for each model (main and drafter). # Sync the expert load pass for each model (main and drafter).
# expert_load_pass: (num_moe_layers, num_physical_experts) # expert_load_pass: (num_moe_layers, num_physical_experts)
expert_load_pass_list = self._sync_load_pass() expert_load_pass_list = self._sync_load_pass()
@ -581,9 +586,10 @@ class EplbState:
if ep_group.rank() == 0: if ep_group.rank() == 0:
logger.info( logger.info(
"EPLB step: %d for model %s: avg_tokens=%.2f, " "EPLB step: %d/%d for model %s: avg_tokens=%.2f, "
"max_tokens=%d, balancedness=%.4f", "max_tokens=%d, balancedness=%.4f",
self.expert_rearrangement_step, self.expert_rearrangement_step,
self.expert_rearrangement_step_interval,
eplb_model_state.model_name, eplb_model_state.model_name,
avg_tokens, avg_tokens,
max_tokens, max_tokens,

View File

@ -528,9 +528,6 @@ def rearrange_expert_weights_inplace(
# Max number of layers to group for communication # Max number of layers to group for communication
max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS
max_group_layers = max(min(max_group_layers, num_moe_layers), 1) max_group_layers = max(min(max_group_layers, num_moe_layers), 1)
logger.info_once(
f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
)
first_layer_weights = list(expert_weights[0]) first_layer_weights = list(expert_weights[0])
# Buffers to hold the expert weights during the exchange. # Buffers to hold the expert weights during the exchange.
@ -552,6 +549,9 @@ def rearrange_expert_weights_inplace(
group=ep_group, group=ep_group,
) )
return return
logger.info_once(
f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
)
# NOTE(bowen): We need this synchronize to run, but I don't know why. # NOTE(bowen): We need this synchronize to run, but I don't know why.
# If you figure out the reason, please let me know -- thank you! # If you figure out the reason, please let me know -- thank you!

View File

@ -419,10 +419,10 @@ class EngineArgs:
) )
_api_process_count: int = ParallelConfig._api_process_count _api_process_count: int = ParallelConfig._api_process_count
_api_process_rank: int = ParallelConfig._api_process_rank _api_process_rank: int = ParallelConfig._api_process_rank
num_redundant_experts: int = EPLBConfig.num_redundant_experts num_redundant_experts: int | None = None
eplb_window_size: int = EPLBConfig.window_size eplb_window_size: int | None = None
eplb_step_interval: int = EPLBConfig.step_interval eplb_step_interval: int | None = None
eplb_log_balancedness: bool = EPLBConfig.log_balancedness eplb_log_balancedness: bool | None = None
max_parallel_loading_workers: int | None = ( max_parallel_loading_workers: int | None = (
ParallelConfig.max_parallel_loading_workers ParallelConfig.max_parallel_loading_workers
) )