Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
This commit is contained in:
rshaw@neuralmagic.com 2025-03-24 01:00:20 +00:00
parent 70e06dd574
commit 220d694080

View File

@@ -37,7 +37,7 @@ wait_for_server() {
wait_for_disagg_server() { wait_for_disagg_server() {
local log_file=$1 local log_file=$1
timeout 1200 bash -c " timeout 1200 bash -c "
until grep -q 'PD Worker is ready' $log_file; do until grep -q 'PDWorker is ready' $log_file; do
sleep 1 sleep 1
done" && return 0 || return 1 done" && return 0 || return 1
} }
@@ -45,9 +45,10 @@ wait_for_disagg_server() {
# You can also adjust --kv-ip and --kv-port for distributed inference. # You can also adjust --kv-ip and --kv-port for distributed inference.
MODEL=meta-llama/Llama-3.1-8B-Instruct MODEL=meta-llama/Llama-3.1-8B-Instruct
CONTROLLER_ADDR=controlleripc CONTROLLER_ADDR=controller.ipc
PREFILL_WORKER_ADDR=prefillipc PREFILL_WORKER_ADDR=prefill.ipc
DECODE_WORKER_ADDR=decodeipc DECODE_WORKER_ADDR=decode.ipc
PORT=8001
# prefilling instance, which is the KV producer # prefilling instance, which is the KV producer
CUDA_VISIBLE_DEVICES=0 python3 ../../vllm/entrypoints/disaggregated/worker.py \ CUDA_VISIBLE_DEVICES=0 python3 ../../vllm/entrypoints/disaggregated/worker.py \
@@ -78,10 +79,10 @@ python3 ../../vllm/entrypoints/disaggregated/api_server.py \
--model $MODEL \ --model $MODEL \
--controller-addr $CONTROLLER_ADDR \ --controller-addr $CONTROLLER_ADDR \
--prefill-addr $PREFILL_WORKER_ADDR \ --prefill-addr $PREFILL_WORKER_ADDR \
--decode-addr $DECODE_WORKER_ADDR --decode-addr $DECODE_WORKER_ADDR &
# wait until prefill, decode instances and proxy are ready # wait until prefill, decode instances and proxy are ready
wait_for_server 8001 wait_for_server $PORT
wait_for_disagg_server vllm_disagg_prefill.log wait_for_disagg_server vllm_disagg_prefill.log
wait_for_disagg_server vllm_disagg_decode.log wait_for_disagg_server vllm_disagg_decode.log
@@ -98,7 +99,7 @@ output1=$(curl -X POST -s http://localhost:8001/v1/completions \
output2=$(curl -X POST -s http://localhost:8001/v1/completions \ output2=$(curl -X POST -s http://localhost:8001/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "model": "meta-llama/Llama-3.1-8B-Instruct",
"prompt": "Santa Clara is a", "prompt": "Santa Clara is a",
"max_tokens": 10, "max_tokens": 10,
"temperature": 0 "temperature": 0