[BugFix] Fix torchrun DP with LLM class (#27395)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
22quinn 2025-10-24 01:11:37 -07:00 committed by GitHub
parent 42efe609ba
commit e0ef8a2920
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -286,10 +286,11 @@ class LLM:
            structured_outputs_instance = StructuredOutputsConfig()
        # warn about single-process data parallel usage.
-        _dps = int(kwargs.get("data_parallel_size", 1))
-        if _dps > 1:
+        _dp_size = int(kwargs.get("data_parallel_size", 1))
+        _distributed_executor_backend = kwargs.get("distributed_executor_backend")
+        if _dp_size > 1 and not _distributed_executor_backend == "external_launcher":
             raise ValueError(
-                f"LLM(data_parallel_size={_dps}) is not supported for single-"
+                f"LLM(data_parallel_size={_dp_size}) is not supported for single-"
                 "process usage and may hang. Please use "
                 "the explicit multi-process data-parallel example at "
                 "'examples/offline_inference/data_parallel.py'."