From d6945ecdf08faf963b38debaec416c79d9ebd114 Mon Sep 17 00:00:00 2001 From: clark Date: Mon, 20 Jan 2025 23:14:37 +0800 Subject: [PATCH] change disagg_prefill example to use zmq Signed-off-by: clark --- .../online_serving/disaggregated_prefill.sh | 23 +++++++++++-------- vllm/entrypoints/disagg_connector.py | 1 + 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index 6925dc8af07e9..ef42854f362d1 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -26,14 +26,6 @@ cleanup() { export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') -# install quart first -- required for disagg prefill proxy serve -if python3 -c "import quart" &> /dev/null; then - echo "Quart is already installed." -else - echo "Quart is not installed. Installing..." - python3 -m pip install quart -fi - # a function that waits vLLM server to start wait_for_server() { local port=$1 @@ -49,6 +41,7 @@ wait_for_server() { # prefilling instance, which is the KV producer CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ --port 8100 \ + --zmq-server-port 7010 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ @@ -58,13 +51,25 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ # decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ --port 8200 \ + --zmq-server-port 7011 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & -# wait until prefill and decode instances are ready +# launch a proxy server that opens the service at port 8000 +# the workflow of this proxy: +# - send the request to prefill vLLM instance (via zmq port 7010), change max_tokens +# to 1 +# - after the prefill vLLM finishes prefill, send the request to decode 
vLLM +# instance (via zmq port 7011) +vllm connect --port 8000 \ + --prefill-addr 127.0.0.1:7010 \ + --decode-addr 127.0.0.1:7011 & + +# wait until the prefill instance, the decode instance, and the proxy are ready +wait_for_server 8000 wait_for_server 8100 wait_for_server 8200 diff --git a/vllm/entrypoints/disagg_connector.py b/vllm/entrypoints/disagg_connector.py index 5eaff166cb75a..c6a7d83c7a3ad 100644 --- a/vllm/entrypoints/disagg_connector.py +++ b/vllm/entrypoints/disagg_connector.py @@ -47,6 +47,7 @@ async def lifespan(app: FastAPI): logger.info("success create_socket_pool sockets_decode") yield ## close zmq context + logger.info("shutdown disagg connector") logger.info("term zmqctx") app.state.zmqctx.destroy(linger=0)