Remove deprecated PyNcclConnector (#24151)

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
This commit is contained in:
Peter Pan 2025-09-04 06:49:16 +08:00 committed by GitHub
parent 36c260dad6
commit b5ee1e3261
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 15 additions and 15 deletions

View File

@@ -62,7 +62,7 @@ benchmark() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 wait_for_server 8100
 wait_for_server 8200

View File

@@ -69,7 +69,7 @@ launch_disagg_prefill() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 CUDA_VISIBLE_DEVICES=1 python3 \
     -m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@ launch_disagg_prefill() {
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

 wait_for_server 8100
 wait_for_server 8200

View File

@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
     ]
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

-    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # Using P2pNcclConnector to transmit KV caches between vLLM instances.
     # This instance is the prefill node (kv_producer, rank 0).
     # The number of parallel instances for KV cache transfer is set to 2,
-    # as required for PyNcclConnector.
+    # as required for P2pNcclConnector.
     ktc = KVTransferConfig(
-        kv_connector="PyNcclConnector",
+        kv_connector="P2pNcclConnector",
         kv_role="kv_producer",
         kv_rank=0,
         kv_parallel_size=2,
@@ -74,12 +74,12 @@ def run_decode(prefill_done):
     ]
     sampling_params = SamplingParams(temperature=0, top_p=0.95)

-    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # Using P2pNcclConnector to transmit KV caches between vLLM instances.
     # This instance is the decode node (kv_consumer, rank 1).
     # The number of parallel instances for KV cache transfer is set to 2,
-    # as required for PyNcclConnector.
+    # as required for P2pNcclConnector.
     ktc = KVTransferConfig(
-        kv_connector="PyNcclConnector",
+        kv_connector="P2pNcclConnector",
         kv_role="kv_consumer",
         kv_rank=1,
         kv_parallel_size=2,

View File

@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
     --gpu-memory-utilization 0.8 \
     --trust-remote-code \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

 # decoding instance, which is the KV consumer
 CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
     --gpu-memory-utilization 0.8 \
     --trust-remote-code \
     --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

 # wait until prefill and decode instances are ready
 wait_for_server 8100

View File

@@ -128,7 +128,7 @@ if __name__ == "__main__":
     print(f"initialized! My rank is {my_rank}")

     config = KVTransferConfig(
-        kv_connector='PyNcclConnector',
+        kv_connector='P2pNcclConnector',
         kv_buffer_device='cuda',
         kv_buffer_size=1e9,
         kv_rank=my_rank,

View File

@@ -137,7 +137,7 @@ if __name__ == "__main__":
     )

     config = KVTransferConfig(
-        kv_connector='PyNcclConnector',
+        kv_connector='P2pNcclConnector',
         kv_buffer_device='cuda',
         kv_buffer_size=1e9,
         kv_rank=my_rank,

View File

@@ -3247,7 +3247,7 @@ class KVTransferConfig:
     kv_parallel_size: int = 1
     """The number of parallel instances for KV cache transfer. For
-    PyNcclConnector, this should be 2."""
+    P2pNcclConnector, this should be 2."""
     kv_ip: str = "127.0.0.1"
     """The KV connector ip, used to build distributed connection."""