mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 04:24:58 +08:00
Remove deprecated PyNcclConnector (#24151)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
This commit is contained in:
parent
36c260dad6
commit
b5ee1e3261
@ -62,7 +62,7 @@ benchmark() {
|
||||
--max-model-len 10000 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
|
||||
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||
@ -72,7 +72,7 @@ benchmark() {
|
||||
--max-model-len 10000 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
|
||||
@ -69,7 +69,7 @@ launch_disagg_prefill() {
|
||||
--max-model-len 10000 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
CUDA_VISIBLE_DEVICES=1 python3 \
|
||||
-m vllm.entrypoints.openai.api_server \
|
||||
@ -78,7 +78,7 @@ launch_disagg_prefill() {
|
||||
--max-model-len 10000 \
|
||||
--gpu-memory-utilization 0.6 \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
|
||||
|
||||
wait_for_server 8100
|
||||
wait_for_server 8200
|
||||
|
||||
@ -30,12 +30,12 @@ def run_prefill(prefill_done):
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
|
||||
|
||||
# Using PyNcclConnector to transmit KV caches between vLLM instances.
|
||||
# Using P2pNcclConnector to transmit KV caches between vLLM instances.
|
||||
# This instance is the prefill node (kv_producer, rank 0).
|
||||
# The number of parallel instances for KV cache transfer is set to 2,
|
||||
# as required for PyNcclConnector.
|
||||
# as required for P2pNcclConnector.
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="PyNcclConnector",
|
||||
kv_connector="P2pNcclConnector",
|
||||
kv_role="kv_producer",
|
||||
kv_rank=0,
|
||||
kv_parallel_size=2,
|
||||
@ -74,12 +74,12 @@ def run_decode(prefill_done):
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0, top_p=0.95)
|
||||
|
||||
# Using PyNcclConnector to transmit KV caches between vLLM instances.
|
||||
# Using P2pNcclConnector to transmit KV caches between vLLM instances.
|
||||
# This instance is the decode node (kv_consumer, rank 1).
|
||||
# The number of parallel instances for KV cache transfer is set to 2,
|
||||
# as required for PyNcclConnector.
|
||||
# as required for P2pNcclConnector.
|
||||
ktc = KVTransferConfig(
|
||||
kv_connector="PyNcclConnector",
|
||||
kv_connector="P2pNcclConnector",
|
||||
kv_role="kv_consumer",
|
||||
kv_rank=1,
|
||||
kv_parallel_size=2,
|
||||
|
||||
@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--trust-remote-code \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
|
||||
|
||||
# decoding instance, which is the KV consumer
|
||||
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
|
||||
@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--trust-remote-code \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
|
||||
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
|
||||
|
||||
# wait until prefill and decode instances are ready
|
||||
wait_for_server 8100
|
||||
|
||||
@ -128,7 +128,7 @@ if __name__ == "__main__":
|
||||
print(f"initialized! My rank is {my_rank}")
|
||||
|
||||
config = KVTransferConfig(
|
||||
kv_connector='PyNcclConnector',
|
||||
kv_connector='P2pNcclConnector',
|
||||
kv_buffer_device='cuda',
|
||||
kv_buffer_size=1e9,
|
||||
kv_rank=my_rank,
|
||||
|
||||
@ -137,7 +137,7 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
config = KVTransferConfig(
|
||||
kv_connector='PyNcclConnector',
|
||||
kv_connector='P2pNcclConnector',
|
||||
kv_buffer_device='cuda',
|
||||
kv_buffer_size=1e9,
|
||||
kv_rank=my_rank,
|
||||
|
||||
@ -3247,7 +3247,7 @@ class KVTransferConfig:
|
||||
|
||||
kv_parallel_size: int = 1
|
||||
"""The number of parallel instances for KV cache transfer. For
|
||||
PyNcclConnector, this should be 2."""
|
||||
P2pNcclConnector, this should be 2."""
|
||||
|
||||
kv_ip: str = "127.0.0.1"
|
||||
"""The KV connector ip, used to build distributed connection."""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user