# vllm/tools/Justfile

# Needed for the proxy server
vllm-directory := "/home/rshaw/vllm/"
# MODEL := "Qwen/Qwen3-0.6B"
MODEL := "meta-llama/Llama-3.1-8B-Instruct"
PROXY_PORT := "8192"
PREFILL_PORT := "8100"
DECODE_PORT := "8200"
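
# Typical workflow (assuming one prefill and one decode instance on a single node):
# run `just prefill`, `just decode`, and `just proxy` in separate terminals, then
# exercise the proxy with `just send_request`, `just benchmark`, or `just eval`.

# Launch the prefill vLLM instance with the NIXL KV connector.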
prefill:
    VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
    CUDA_VISIBLE_DEVICES=0,7 \
    vllm serve {{MODEL}} \
        --port {{PREFILL_PORT}} \
        --tensor-parallel-size 2 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
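
# Launch the decode vLLM instance on separate GPUs, with its own NIXL side channel port.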
decode:
    VLLM_NIXL_SIDE_CHANNEL_PORT=5567 \
    CUDA_VISIBLE_DEVICES=4,5 \
    vllm serve {{MODEL}} \
        --port {{DECODE_PORT}} \
        --tensor-parallel-size 2 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
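
# Start the toy proxy server that routes requests between the prefill and decode instances.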
proxy:
    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
        --port {{PROXY_PORT}} \
        --prefiller-port {{PREFILL_PORT}} \
        --decoder-port {{DECODE_PORT}}
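
# Send a single test completion request through the proxy.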
send_request:
    curl -X POST http://localhost:{{PROXY_PORT}}/v1/completions \
        -H "Content-Type: application/json" \
        -d '{ \
            "model": "{{MODEL}}", \
            "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
            "max_tokens": 150, \
            "temperature": 0.7 \
        }'
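
# Run the serving benchmark with random prompts through the proxy,
# e.g. `just benchmark 100`.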
benchmark NUM_PROMPTS:
    python {{vllm-directory}}benchmarks/benchmark_serving.py \
        --port {{PROXY_PORT}} \
        --model {{MODEL}} \
        --dataset-name random \
        --random-input-len 30000 \
        --random-output-len 10 \
        --num-prompts {{NUM_PROMPTS}} \
        --seed $(date +%s)
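
# Benchmark single-concurrency requests through the proxy (disaggregated path),
# e.g. `just benchmark_one 8000`.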
benchmark_one INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --port {{PROXY_PORT}} \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s)
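
# Same single-concurrency benchmark, but sent directly to the decode instance
# (bypassing the proxy) as a non-disaggregated baseline.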
benchmark_one_no_pd INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --port {{DECODE_PORT}} \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s)
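
# Run a GSM8K accuracy check against the proxy endpoint via lm_eval.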
eval:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:{{PROXY_PORT}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000