Better error when world size is larger than node and distributed_executor_backend is not set (#30140)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-12-06 04:53:52 +00:00 committed by GitHub
parent 7e31c3a3f6
commit bf4a901af9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -184,13 +184,14 @@ class ParallelConfig:
distributed_executor_backend: ( distributed_executor_backend: (
str | DistributedExecutorBackend | type[Executor] | None str | DistributedExecutorBackend | type[Executor] | None
) = None ) = None
"""Backend to use for distributed model """Backend to use for distributed model workers, either "ray" or "mp"
workers, either "ray" or "mp" (multiprocessing). If the product (multiprocessing). If the product of pipeline_parallel_size and tensor_parallel_size
of pipeline_parallel_size and tensor_parallel_size is less than is less than or equal to the number of GPUs available, "mp" will be used to
or equal to the number of GPUs available, "mp" will be used to keep processing on a single host. Otherwise, an error will be raised. To use "mp"
keep processing on a single host. Otherwise, this will default you must also set nnodes, and to use "ray" you must manually set
to "ray" if Ray is installed and fail otherwise. Note that tpu distributed_executor_backend to "ray".
only support Ray for distributed inference."""
Note that tpu only support Ray for distributed inference."""
worker_cls: str = "auto" worker_cls: str = "auto"
"""The full name of the worker class to use. If "auto", the worker class """The full name of the worker class to use. If "auto", the worker class
@@ -566,8 +567,11 @@ class ParallelConfig:
): ):
gpu_count = cuda_device_count_stateless() gpu_count = cuda_device_count_stateless()
raise ValueError( raise ValueError(
f"Tensor parallel size ({self.world_size}) cannot be " f"World size ({self.world_size}) is larger than the number of "
f"larger than the number of available GPUs ({gpu_count})." f"available GPUs ({gpu_count}) in this node. If this is "
"intentional and you are using:\n"
"- ray, set '--distributed-executor-backend ray'.\n"
"- multiprocessing, set '--nnodes' appropriately."
) )
elif self.data_parallel_backend == "ray": elif self.data_parallel_backend == "ray":
logger.info( logger.info(