From 5cc54f7c5bf2516b909fadb0ad09b68e65c42812 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Thu, 21 Aug 2025 21:16:38 +0800
Subject: [PATCH] [Doc] Fix batch-level DP example (#23325)

Signed-off-by: DarkLight1337
Signed-off-by: Cyrus Leung
Co-authored-by: youkaichao
---
 docs/configuration/optimization.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index db9dfb313fb87..357a5eb594060 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -153,13 +153,14 @@
 from vllm import LLM
 
 llm = LLM(
     model="Qwen/Qwen2.5-VL-72B-Instruct",
-    # Create two EngineCore instances, one per DP rank
-    data_parallel_size=2,
-    # Within each EngineCore instance:
-    # The vision encoder uses TP=4 (not DP=2) to shard the input data
-    # The language decoder uses TP=4 to shard the weights as usual
     tensor_parallel_size=4,
+    # When mm_encoder_tp_mode="data",
+    # the vision encoder uses TP=4 (not DP=1) to shard the input data,
+    # so the TP size becomes the effective DP size.
+    # Note that this is independent of the DP size for the language decoder, which is used in the expert parallel setting.
     mm_encoder_tp_mode="data",
+    # The language decoder uses TP=4 to shard the weights regardless
+    # of the mm_encoder_tp_mode setting
 )
 ```