From e6f114ac25967b073954f7f3dc733672d173124c Mon Sep 17 00:00:00 2001
From: Sage Moore
Date: Tue, 2 Dec 2025 14:20:22 -0800
Subject: [PATCH] [Bugfix][EPLB] Prevent user-provided EPLB config from being
 overwritten with defaults (#29911)

Signed-off-by: Sage Moore
---
 tests/distributed/test_eplb_spec_decode.py | 16 +++++++++-------
 vllm/engine/arg_utils.py                   | 14 --------------
 2 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index c055b7a3f6dd7..868cc702866e2 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -22,7 +22,14 @@ def get_model_args(
         "num_speculative_tokens": 1,
         "max_model_len": model_max_len,
     }
-
+    eplb_config = {
+        "num_redundant_experts": tp_size,
+        "window_size": 128,
+        "step_interval": 1024,
+        "log_balancedness": False,
+    }
+    if use_async:
+        eplb_config["use_async"] = True
     model_args = {
         "pretrained": model_name,
         "dtype": "auto",
@@ -31,15 +38,10 @@
         "gpu_memory_utilization": 0.7,
         "speculative_config": speculative_config,
         "enable_expert_parallel": True,
-        "num_redundant_experts": tp_size,
-        "eplb_window_size": 128,
-        "eplb_step_interval": 1024,
-        "eplb_log_balancedness": False,
+        "eplb_config": eplb_config,
         "enable_eplb": True,
         "max_model_len": model_max_len,
     }
-    if use_async:
-        model_args["eplb_config"] = {"use_async": True}
 
     return model_args
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 83029e09ceaad..096217da4fe44 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -421,10 +421,6 @@ class EngineArgs:
     )
     _api_process_count: int = ParallelConfig._api_process_count
     _api_process_rank: int = ParallelConfig._api_process_rank
-    num_redundant_experts: int = EPLBConfig.num_redundant_experts
-    eplb_window_size: int = EPLBConfig.window_size
-    eplb_step_interval: int = EPLBConfig.step_interval
-    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
     max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
@@ -1582,16 +1578,6 @@ class EngineArgs:
             )
             self.disable_nccl_for_dp_synchronization = True
 
-        # Forward the deprecated CLI args to the EPLB config.
-        if self.num_redundant_experts is not None:
-            self.eplb_config.num_redundant_experts = self.num_redundant_experts
-        if self.eplb_window_size is not None:
-            self.eplb_config.window_size = self.eplb_window_size
-        if self.eplb_step_interval is not None:
-            self.eplb_config.step_interval = self.eplb_step_interval
-        if self.eplb_log_balancedness is not None:
-            self.eplb_config.log_balancedness = self.eplb_log_balancedness
-
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
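
Note (not part of the applied patch): a minimal sketch of how EPLB options
are passed after this change, mirroring the updated test. The model name
and values here are hypothetical, and it assumes the offline LLM entrypoint
forwards these keyword arguments to EngineArgs the same way the test
harness does:

    from vllm import LLM

    llm = LLM(
        model="some/moe-model",  # hypothetical MoE checkpoint
        tensor_parallel_size=2,
        enable_expert_parallel=True,
        enable_eplb=True,
        # All EPLB tuning now lives in the nested eplb_config, rather than
        # the removed flat fields (num_redundant_experts, eplb_window_size,
        # eplb_step_interval, eplb_log_balancedness), so user-provided
        # values are no longer clobbered by EngineArgs defaults.
        eplb_config={
            "num_redundant_experts": 2,
            "window_size": 128,
            "step_interval": 1024,
            "log_balancedness": False,
        },
    )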