Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
ilmarkov 2025-12-15 17:16:40 +00:00
parent 7ebd46fe76
commit c761ce527a
2 changed files with 8 additions and 2 deletions

View File

@ -78,8 +78,8 @@ def run_rebalance_experts(
eplb_stats.num_gpus,
model_state.physical_to_logical_map,
)
assert new_physical_to_logical_map.device == torch.device("cpu")
# Move map to cpu
model_state.new_physical_to_logical_map = new_physical_to_logical_map
max_slots = model_state.logical_to_physical_map.shape[-1]
@ -109,6 +109,10 @@ async def transfer_run_periodically(
continue
if not model_state.new_indices_computed:
run_rebalance_experts(model_state, state)
logger.info(
"Async worker computed new indices for model %s",
model_state.model_name,
)
current_num_layers = model_state.model.num_moe_layers
while (

View File

@ -917,7 +917,9 @@ class EplbState:
)
else:
eplb_model_state.eplb_stats = EplbStats(
global_expert_load_window=global_expert_load_window,
# Clone the tensor on the main thread so that the async thread
# works from a snapshot of the workload.
global_expert_load_window=global_expert_load_window.clone(),
num_replicas=num_replicas,
num_groups=num_groups,
num_nodes=num_nodes,