vllm/examples/online_serving/disaggregated_prefill_zmq.sh

#!/bin/bash
# This file demonstrates the example usage of disaggregated prefilling with ZMQ
# We will launch 2 vllm instances (1 for prefill and 1 for decode),
# and then transfer the KV cache between them.

set -xe

echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧"
sleep 1

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'cleanup' INT

# Cleanup function
cleanup() {
    echo "Caught Ctrl+C, cleaning up..."
    # Cleanup commands
    pgrep python | xargs kill -9
    pkill -f python
    echo "Cleanup complete. Exiting."
    exit 0
}

export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

# a function that waits vLLM connect to start
wait_for_server() {
  local port=$1
  timeout 1200 bash -c "
    until curl -s localhost:${port}/v1/completions > /dev/null; do
      sleep 1
    done" && return 0 || return 1
}


# a function that waits vLLM disagg to start
wait_for_disagg_server() {
  local log_file=$1
  timeout 1200 bash -c "
    until grep -q 'zmq Server started at' $log_file; do
      sleep 1
    done" && return 0 || return 1
}


# You can also adjust --kv-ip and --kv-port for distributed inference.
MODEL=meta-llama/Llama-3.1-8B-Instruct
CONNECTOR_ADDR=connectoripc
PREFILL_WORKER_ADDR=prefillipc
DECODE_WORKER_ADDR=decodeipc

# prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 python3 ../vllm/entrypoints/disaggregated/worker.py \
    --model $MODEL \
    --connector-addr $CONNECTOR_ADDR \
    --worker-addr $PREFILL_WORKER_ADDR \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' > vllm_disagg_prefill.log 2>&1 &

# decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 python3 ../vllm/entrypoints/disaggregated/worker.py \
    --model $MODEL \
    --connector-addr $CONNECTOR_ADDR \
    --worker-addr $DECODE_WORKER_ADDR \
    --max-model-len 100 \
    --gpu-memory-utilization 0.8 \
    --kv-transfer-config \
    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' > vllm_disagg_decode.log 2>&1 &

# launch a proxy server that opens the service at port 8000
# the workflow of this proxy:
# - Send req to prefill instance, wait until complete.
# - Send req to decode instance, streaming tokens.
python3 ../vllm/entrypoints/disaggregated/connector.py \
    --port $PORT \
    --model $MODEL \
    --connector-addr $CONNECTOR_ADDR \
    --prefill-addr $PREFILL_WORKER_ADDR \
    --decode-addr $DECODE_WORKER_ADDR

# wait until prefill, decode instances and proxy are ready
wait_for_server 8001
wait_for_disagg_server vllm_disagg_prefill.log
wait_for_disagg_server vllm_disagg_decode.log

# serve two example requests
output1=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Llama-3.1-8B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}')

output2=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"prompt": "Santa Clara is a",
"max_tokens": 10,
"temperature": 0
}')


# Cleanup commands
pgrep python | xargs kill -9
pkill -f python

echo ""

sleep 1

# Print the outputs of the curl requests
echo ""
echo "Output of first request: $output1"
echo "Output of second request: $output2"

echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉"
echo ""