Mirror of https://git.datalinker.icu/vllm-project/vllm.git
Edit config and fix config post_init

Signed-off-by: ilmarkov <markovilya197@gmail.com>

parent b8533148ed
commit 30bab971c0
@@ -60,6 +60,10 @@ class EPLBConfig:
     Log the balancedness each step of expert parallelism.
     This is turned off by default since it will cause communication overhead.
     """
+    log_balancedness_interval: int = 1
+    """
+    Interval for logging the balancedness.
+    """
     use_async: bool = False
     """
     Whether to use non-blocking EPLB.
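The new interval knob composes with the existing log_balancedness flag. A minimal sketch of the resulting config surface, assuming the dataclass decorator and with the remaining EPLBConfig fields omitted:

    # Sketch only: field names and defaults are taken from the diff.
    from dataclasses import dataclass

    @dataclass
    class EPLBConfig:
        log_balancedness: bool = False        # off by default (communication overhead)
        log_balancedness_interval: int = 1    # new: log every N rearrangement steps
        use_async: bool = False               # whether to use non-blocking EPLB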
@@ -549,7 +549,12 @@ class EplbState:
         for eplb_model_state in self.model_states.values():
             eplb_model_state.expert_load_pass.zero_()
 
-        if log_stats:
+        if (
+            log_stats
+            and self.expert_rearrangement_step
+            % self.parallel_config.eplb_config.log_balancedness_interval
+            == 0
+        ):
             # Sync the expert load pass for each model (main and drafter).
             # expert_load_pass: (num_moe_layers, num_physical_experts)
             expert_load_pass_list = self._sync_load_pass()
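The rewritten condition gates stats logging on the new interval. A hypothetical standalone illustration of the same modulo check (the helper name is made up):

    # Stats are emitted only when logging is enabled and the current
    # rearrangement step is a multiple of the configured interval.
    def should_log(step: int, interval: int, log_stats: bool) -> bool:
        return log_stats and step % interval == 0

    assert [s for s in range(6) if should_log(s, 3, True)] == [0, 3]
    assert not any(should_log(s, 3, False) for s in range(6))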
@@ -581,9 +586,10 @@ class EplbState:
 
             if ep_group.rank() == 0:
                 logger.info(
-                    "EPLB step: %d for model %s: avg_tokens=%.2f, "
+                    "EPLB step: %d/%d for model %s: avg_tokens=%.2f, "
                     "max_tokens=%d, balancedness=%.4f",
                     self.expert_rearrangement_step,
+                    self.expert_rearrangement_step_interval,
                     eplb_model_state.model_name,
                     avg_tokens,
                     max_tokens,
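The extra %d reports the current step against expert_rearrangement_step_interval. A hypothetical rendering with made-up values:

    msg = (
        "EPLB step: %d/%d for model %s: avg_tokens=%.2f, "
        "max_tokens=%d, balancedness=%.4f"
    ) % (12, 32, "main", 4096.0, 5120, 0.8)
    # -> 'EPLB step: 12/32 for model main: avg_tokens=4096.00,
    #     max_tokens=5120, balancedness=0.8000'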
@@ -528,9 +528,6 @@ def rearrange_expert_weights_inplace(
     # Max number of layers to group for communication
     max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS
     max_group_layers = max(min(max_group_layers, num_moe_layers), 1)
-    logger.info_once(
-        f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
-    )
 
     first_layer_weights = list(expert_weights[0])
     # Buffers to hold the expert weights during the exchange.
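The clamp bounds the env-provided group size before it is used. A small self-contained sketch of that arithmetic (the helper name is hypothetical):

    # Bound the requested group size to [1, num_moe_layers]: grouping can
    # never span more layers than the model has, and never fewer than one.
    def clamp_group_layers(requested: int, num_moe_layers: int) -> int:
        return max(min(requested, num_moe_layers), 1)

    assert clamp_group_layers(8, 4) == 4  # capped at the MoE layer count
    assert clamp_group_layers(0, 4) == 1  # floor of one layer per group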
@@ -552,6 +549,9 @@ def rearrange_expert_weights_inplace(
             group=ep_group,
         )
         return
+    logger.info_once(
+        f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
+    )
 
     # NOTE(bowen): We need this synchronize to run, but I don't know why.
     # If you figure out the reason, please let me know -- thank you!
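Read together with the previous hunk, the info_once call moves from before the early-return branch to after it, so the grouping message is only emitted when the grouped rearrangement path actually runs. A control-flow sketch under that reading (the function body here is illustrative, not the real one):

    def rearrange_sketch(single_shot: bool, max_group_layers: int) -> None:
        if single_shot:
            ...      # exchange all weights in one pass, then
            return   # return early: the grouping log is skipped
        # only the grouped path reaches the relocated log statement
        print(f"EPLB Sync: rearrange max_group_layers: {max_group_layers}")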
@@ -419,10 +419,10 @@ class EngineArgs:
     )
     _api_process_count: int = ParallelConfig._api_process_count
     _api_process_rank: int = ParallelConfig._api_process_rank
-    num_redundant_experts: int = EPLBConfig.num_redundant_experts
-    eplb_window_size: int = EPLBConfig.window_size
-    eplb_step_interval: int = EPLBConfig.step_interval
-    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
+    num_redundant_experts: int | None = None
+    eplb_window_size: int | None = None
+    eplb_step_interval: int | None = None
+    eplb_log_balancedness: bool | None = None
     max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
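Switching the EngineArgs mirrors of the EPLB fields to None defaults lets a later resolution step distinguish "never set on the CLI" from "explicitly set to the old default". A minimal sketch of that pattern, assuming the fallback lives in a __post_init__-style hook (the commit message mentions fixing the config post_init; class names here are stand-ins):

    from dataclasses import dataclass

    @dataclass
    class EPLBDefaults:  # stand-in for EPLBConfig's defaults
        num_redundant_experts: int = 0
        log_balancedness: bool = False

    @dataclass
    class Args:  # stand-in for EngineArgs
        num_redundant_experts: int | None = None
        eplb_log_balancedness: bool | None = None

        def __post_init__(self) -> None:
            # Only values the user never supplied fall back to the
            # EPLB config defaults; explicit values are preserved.
            d = EPLBDefaults()
            if self.num_redundant_experts is None:
                self.num_redundant_experts = d.num_redundant_experts
            if self.eplb_log_balancedness is None:
                self.eplb_log_balancedness = d.log_balancedness

    args = Args(num_redundant_experts=4)
    assert args.num_redundant_experts == 4      # explicit value kept
    assert args.eplb_log_balancedness is False  # unset -> default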