diff --git a/vllm/config.py b/vllm/config.py
index 04db21766c657..a72fa42f3c332 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1892,16 +1892,15 @@ class ParallelConfig:
     data_parallel_external_lb: bool = False
     """Whether to use "external" DP LB mode. Applies only to online serving
     and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
-    wide-EP setup in Kuberentes. Set implicitly when data_parallel_rank
+    wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank
     is provided explicitly to vllm serve."""
     data_parallel_hybrid_lb: bool = False
     """Whether to use "hybrid" DP LB mode. Applies only to online serving
     and when data_parallel_size > 0.
     Enables running an AsyncLLM and API server on a "per-node" basis where
     vLLM load balances between local data parallel ranks, but an external LB
-    balances between vLLM nodes/replicas. This is useful for a "one-pod-per-node"
-    wide-EP setup in Kuberentes. Set explicitly by the user.
-    """
+    balances between vLLM nodes/replicas. Set explicitly in conjunction with
+    --data-parallel-start-rank."""
     enable_expert_parallel: bool = False
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     enable_eplb: bool = False
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 3a542ff78e805..703f5c0b20f4b 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1097,32 +1097,17 @@ class EngineArgs:
             # but we should not do this here.
             placement_group = ray.util.get_current_placement_group()

-        # Organize --data-parallel-start-rank and --data-parallel-rank.
-        if self.data_parallel_start_rank is not None:
-            if self.data_parallel_rank is not None:
-                raise ValueError(
-                    "Found --data-parallel-rank and --data-parallel-start-rank."
-                    "Only one should be set (use --data-parallel-start-rank).")
-            else:
-                self.data_parallel_rank = self.data_parallel_start_rank
-
-        # Validate External LB.
-        data_parallel_hybrid_lb = True
-
-        if data_parallel_hybrid_lb:
-            if self.data_parallel_size_local is None:
-                raise ValueError(
-                    "With external LB, --data-parallel-size-local must be set."
-                )
-
-            if self.data_parallel_size_local >= self.data_parallel_size:
-                raise ValueError(
-                    "With external LB, --data-parallel-size-local must be "
-                    "less than --data-parallel-size.")
-
-        # Local DP size defaults to global DP size if not set.
-        data_parallel_size_local = (self.data_parallel_size_local
-                                    or self.data_parallel_size)
+        data_parallel_external_lb = self.data_parallel_rank is not None
+        if data_parallel_external_lb:
+            assert self.data_parallel_size_local in (1, None), (
+                "data_parallel_size_local must be 1 when data_parallel_rank "
+                "is set")
+            data_parallel_size_local = 1
+        elif self.data_parallel_size_local is not None:
+            data_parallel_size_local = self.data_parallel_size_local
+        else:
+            # Local DP size defaults to global DP size if not set.
+            data_parallel_size_local = self.data_parallel_size

         # DP address, used in multi-node case for torch distributed group
         # and ZMQ sockets.
@@ -1177,7 +1162,7 @@ class EngineArgs:
             data_parallel_master_ip=data_parallel_address,
             data_parallel_rpc_port=data_parallel_rpc_port,
             data_parallel_backend=self.data_parallel_backend,
-            data_parallel_hybrid_lb=False,
+            data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
             enable_expert_parallel=self.enable_expert_parallel,
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.num_redundant_experts,
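
For reviewers, a minimal standalone sketch of the resolution logic the new arg_utils.py block implements (plain Python; `DPArgs` and `resolve_dp_sizes` are illustrative names, not part of the vLLM API): passing --data-parallel-rank implies external LB mode and forces the local DP size to 1 (one engine rank per pod), an explicit --data-parallel-size-local is used as given, and otherwise the local size defaults to the global --data-parallel-size.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class DPArgs:
    """Illustrative stand-in for the relevant EngineArgs fields."""
    data_parallel_size: int = 1
    data_parallel_size_local: Optional[int] = None
    data_parallel_rank: Optional[int] = None


def resolve_dp_sizes(args: DPArgs):
    """Return (data_parallel_external_lb, data_parallel_size_local)."""
    external_lb = args.data_parallel_rank is not None
    if external_lb:
        # External LB: each pod serves exactly one DP rank.
        assert args.data_parallel_size_local in (1, None), (
            "data_parallel_size_local must be 1 when data_parallel_rank is set")
        size_local = 1
    elif args.data_parallel_size_local is not None:
        # Multi-node / hybrid-style: the user supplies the per-node DP size.
        size_local = args.data_parallel_size_local
    else:
        # Local DP size defaults to global DP size if not set.
        size_local = args.data_parallel_size
    return external_lb, size_local


if __name__ == "__main__":
    # "One-pod-per-rank" external LB: rank given, local size becomes 1.
    print(resolve_dp_sizes(DPArgs(data_parallel_size=8, data_parallel_rank=3)))
    # "One-pod-per-node" hybrid-style: 8 ranks total, 4 per node.
    print(resolve_dp_sizes(DPArgs(data_parallel_size=8, data_parallel_size_local=4)))
    # Single node: local size defaults to the global size.
    print(resolve_dp_sizes(DPArgs(data_parallel_size=2)))
```

Running the sketch prints `(True, 1)`, `(False, 4)` and `(False, 2)` for the three cases above.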