[Doc] Fix batch-level DP example (#23325)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
parent 0c6e40bbaa
commit 5cc54f7c5b
@@ -153,13 +153,14 @@ from vllm import LLM
 
 llm = LLM(
     model="Qwen/Qwen2.5-VL-72B-Instruct",
-    # Create two EngineCore instances, one per DP rank
-    data_parallel_size=2,
-    # Within each EngineCore instance:
-    # The vision encoder uses TP=4 (not DP=2) to shard the input data
-    # The language decoder uses TP=4 to shard the weights as usual
     tensor_parallel_size=4,
+    # When mm_encoder_tp_mode="data",
+    # the vision encoder uses TP=4 (not DP=1) to shard the input data,
+    # so the TP size becomes the effective DP size.
+    # Note that this is independent of the DP size for the language decoder, which is used in the expert parallel setting.
     mm_encoder_tp_mode="data",
+    # The language decoder uses TP=4 to shard the weights regardless
+    # of the setting of mm_encoder_tp_mode.
 )
 ```
 
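For context, below is a minimal, self-contained sketch of the corrected example as it reads after this change. The surrounding setup (SamplingParams, the text-only prompt, and the assumption of a node with at least 4 GPUs to satisfy tensor_parallel_size=4) is illustrative and not part of the commit.

```python
# Illustrative sketch of the corrected configuration; assumes a node with
# at least 4 GPUs so that tensor_parallel_size=4 can be satisfied.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-VL-72B-Instruct",
    tensor_parallel_size=4,
    # When mm_encoder_tp_mode="data",
    # the vision encoder uses TP=4 (not DP=1) to shard the input data,
    # so the TP size becomes the effective DP size.
    mm_encoder_tp_mode="data",
    # The language decoder uses TP=4 to shard the weights regardless
    # of the setting of mm_encoder_tp_mode.
)

# A text-only prompt is used here purely to exercise the engine; a real
# multimodal request would also pass image inputs.
outputs = llm.generate(
    ["Describe what tensor parallelism does in one sentence."],
    SamplingParams(max_tokens=64),
)
for output in outputs:
    print(output.outputs[0].text)
```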