From 5cc54f7c5bf2516b909fadb0ad09b68e65c42812 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Thu, 21 Aug 2025 21:16:38 +0800
Subject: [PATCH] [Doc] Fix batch-level DP example (#23325)

Signed-off-by: DarkLight1337
Signed-off-by: Cyrus Leung
Co-authored-by: youkaichao
---
 docs/configuration/optimization.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index db9dfb313fb87..357a5eb594060 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -153,13 +153,14 @@
 from vllm import LLM
 
 llm = LLM(
     model="Qwen/Qwen2.5-VL-72B-Instruct",
-    # Create two EngineCore instances, one per DP rank
-    data_parallel_size=2,
-    # Within each EngineCore instance:
-    # The vision encoder uses TP=4 (not DP=2) to shard the input data
-    # The language decoder uses TP=4 to shard the weights as usual
     tensor_parallel_size=4,
+    # When mm_encoder_tp_mode="data",
+    # the vision encoder uses TP=4 (not DP=1) to shard the input data,
+    # so the TP size becomes the effective DP size.
+    # Note that this is independent of the DP size for the language decoder, which is used in the expert parallel setting.
     mm_encoder_tp_mode="data",
+    # The language decoder uses TP=4 to shard the weights regardless
+    # of the mm_encoder_tp_mode setting
 )
 ```