mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-05 16:49:06 +08:00
Edit config and fix config post_init
Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
parent
b8533148ed
commit
30bab971c0
@ -60,6 +60,10 @@ class EPLBConfig:
|
|||||||
Log the balancedness each step of expert parallelism.
|
Log the balancedness each step of expert parallelism.
|
||||||
This is turned off by default since it will cause communication overhead.
|
This is turned off by default since it will cause communication overhead.
|
||||||
"""
|
"""
|
||||||
|
log_balancedness_interval: int = 1
|
||||||
|
"""
|
||||||
|
Interval for logging the balancedness.
|
||||||
|
"""
|
||||||
use_async: bool = False
|
use_async: bool = False
|
||||||
"""
|
"""
|
||||||
Whether to use non-blocking EPLB.
|
Whether to use non-blocking EPLB.
|
||||||
|
|||||||
@ -549,7 +549,12 @@ class EplbState:
|
|||||||
for eplb_model_state in self.model_states.values():
|
for eplb_model_state in self.model_states.values():
|
||||||
eplb_model_state.expert_load_pass.zero_()
|
eplb_model_state.expert_load_pass.zero_()
|
||||||
|
|
||||||
if log_stats:
|
if (
|
||||||
|
log_stats
|
||||||
|
and self.expert_rearrangement_step
|
||||||
|
% self.parallel_config.eplb_config.log_balancedness_interval
|
||||||
|
== 0
|
||||||
|
):
|
||||||
# Sync the expert load pass for each model (main and drafter).
|
# Sync the expert load pass for each model (main and drafter).
|
||||||
# expert_load_pass: (num_moe_layers, num_physical_experts)
|
# expert_load_pass: (num_moe_layers, num_physical_experts)
|
||||||
expert_load_pass_list = self._sync_load_pass()
|
expert_load_pass_list = self._sync_load_pass()
|
||||||
@ -581,9 +586,10 @@ class EplbState:
|
|||||||
|
|
||||||
if ep_group.rank() == 0:
|
if ep_group.rank() == 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
"EPLB step: %d for model %s: avg_tokens=%.2f, "
|
"EPLB step: %d/%d for model %s: avg_tokens=%.2f, "
|
||||||
"max_tokens=%d, balancedness=%.4f",
|
"max_tokens=%d, balancedness=%.4f",
|
||||||
self.expert_rearrangement_step,
|
self.expert_rearrangement_step,
|
||||||
|
self.expert_rearrangement_step_interval,
|
||||||
eplb_model_state.model_name,
|
eplb_model_state.model_name,
|
||||||
avg_tokens,
|
avg_tokens,
|
||||||
max_tokens,
|
max_tokens,
|
||||||
|
|||||||
@ -528,9 +528,6 @@ def rearrange_expert_weights_inplace(
|
|||||||
# Max number of layers to group for communication
|
# Max number of layers to group for communication
|
||||||
max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS
|
max_group_layers = envs.VLLM_EPLB_SYNC_MAX_GROUPED_LAYERS
|
||||||
max_group_layers = max(min(max_group_layers, num_moe_layers), 1)
|
max_group_layers = max(min(max_group_layers, num_moe_layers), 1)
|
||||||
logger.info_once(
|
|
||||||
f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
|
|
||||||
)
|
|
||||||
|
|
||||||
first_layer_weights = list(expert_weights[0])
|
first_layer_weights = list(expert_weights[0])
|
||||||
# Buffers to hold the expert weights during the exchange.
|
# Buffers to hold the expert weights during the exchange.
|
||||||
@ -552,6 +549,9 @@ def rearrange_expert_weights_inplace(
|
|||||||
group=ep_group,
|
group=ep_group,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
logger.info_once(
|
||||||
|
f"EPLB Sync: rearrange max_group_layers: {max_group_layers}", scope="global"
|
||||||
|
)
|
||||||
|
|
||||||
# NOTE(bowen): We need this synchronize to run, but I don't know why.
|
# NOTE(bowen): We need this synchronize to run, but I don't know why.
|
||||||
# If you figure out the reason, please let me know -- thank you!
|
# If you figure out the reason, please let me know -- thank you!
|
||||||
|
|||||||
@ -419,10 +419,10 @@ class EngineArgs:
|
|||||||
)
|
)
|
||||||
_api_process_count: int = ParallelConfig._api_process_count
|
_api_process_count: int = ParallelConfig._api_process_count
|
||||||
_api_process_rank: int = ParallelConfig._api_process_rank
|
_api_process_rank: int = ParallelConfig._api_process_rank
|
||||||
num_redundant_experts: int = EPLBConfig.num_redundant_experts
|
num_redundant_experts: int | None = None
|
||||||
eplb_window_size: int = EPLBConfig.window_size
|
eplb_window_size: int | None = None
|
||||||
eplb_step_interval: int = EPLBConfig.step_interval
|
eplb_step_interval: int | None = None
|
||||||
eplb_log_balancedness: bool = EPLBConfig.log_balancedness
|
eplb_log_balancedness: bool | None = None
|
||||||
max_parallel_loading_workers: int | None = (
|
max_parallel_loading_workers: int | None = (
|
||||||
ParallelConfig.max_parallel_loading_workers
|
ParallelConfig.max_parallel_loading_workers
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user