diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index db9dfb313fb8..357a5eb59406 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -153,13 +153,14 @@
 from vllm import LLM
 
 llm = LLM(
     model="Qwen/Qwen2.5-VL-72B-Instruct",
-    # Create two EngineCore instances, one per DP rank
-    data_parallel_size=2,
-    # Within each EngineCore instance:
-    # The vision encoder uses TP=4 (not DP=2) to shard the input data
-    # The language decoder uses TP=4 to shard the weights as usual
     tensor_parallel_size=4,
+    # When mm_encoder_tp_mode="data",
+    # the vision encoder uses TP=4 (not DP=1) to shard the input data,
+    # so the TP size becomes the effective DP size.
+    # Note that this is independent of the DP size for the language decoder, which is used in expert-parallel settings.
     mm_encoder_tp_mode="data",
+    # The language decoder uses TP=4 to shard the weights regardless
+    # of the mm_encoder_tp_mode setting
 )
 ```
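
For comparison, here is a minimal sketch of the same setup without encoder data parallelism. It assumes `mm_encoder_tp_mode` defaults to `"weights"` (the other documented value), in which case the vision encoder's weights are TP-sharded across the same ranks as the language decoder:

```python
from vllm import LLM

# Sketch, assuming "weights" is the default mm_encoder_tp_mode:
# the vision encoder shards its weights across the 4 TP ranks,
# mirroring the language decoder, instead of sharding the input data.
llm = LLM(
    model="Qwen/Qwen2.5-VL-72B-Instruct",
    tensor_parallel_size=4,
    mm_encoder_tp_mode="weights",
)
```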