From bd57841c7ba605cf958f6185012a5b91726f5ff7 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Mon, 7 Jul 2025 01:14:10 +0000
Subject: [PATCH] updated

Signed-off-by: Robert Shaw
---
 tools/Justfile | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 tools/Justfile

diff --git a/tools/Justfile b/tools/Justfile
new file mode 100644
index 0000000000000..0ffced37272ee
--- /dev/null
+++ b/tools/Justfile
@@ -0,0 +1,90 @@
+# Justfile for a disaggregated prefill/decode vLLM deployment using the
+# NixlConnector KV-transfer path, plus proxy, benchmark, and eval recipes.
+
+# Needed for the proxy server (must end with a trailing slash).
+vllm-directory := "/home/rshaw/vllm/"
+
+# MODEL := "Qwen/Qwen3-0.6B"
+MODEL := "meta-llama/Llama-3.1-8B-Instruct"
+PROXY_PORT := "8192"
+PREFILL_PORT := "8100"
+DECODE_PORT := "8200"
+
+# Start the prefill-side vLLM server (tensor-parallel across GPUs 0 and 7).
+prefill:
+    VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
+    CUDA_VISIBLE_DEVICES=0,7 \
+    vllm serve {{MODEL}} \
+    --port {{PREFILL_PORT}} \
+    --tensor-parallel-size 2 \
+    --enforce-eager \
+    --disable-log-requests \
+    --block-size 128 \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+
+# Start the decode-side vLLM server (separate GPUs and side-channel port).
+decode:
+    VLLM_NIXL_SIDE_CHANNEL_PORT=5567 \
+    CUDA_VISIBLE_DEVICES=4,5 \
+    vllm serve {{MODEL}} \
+    --port {{DECODE_PORT}} \
+    --tensor-parallel-size 2 \
+    --enforce-eager \
+    --disable-log-requests \
+    --block-size 128 \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+
+# Run the toy proxy that routes requests to the prefill/decode servers.
+proxy:
+    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
+    --port {{PROXY_PORT}} \
+    --prefiller-port {{PREFILL_PORT}} \
+    --decoder-port {{DECODE_PORT}}
+
+# Send a single completion request through the proxy.
+send_request:
+    curl -X POST http://localhost:{{PROXY_PORT}}/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{ \
+    "model": "{{MODEL}}", \
+    "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
+    "max_tokens": 150, \
+    "temperature": 0.7 \
+    }'
+
+# Serving benchmark through the proxy with NUM_PROMPTS random prompts.
+benchmark NUM_PROMPTS:
+    python {{vllm-directory}}benchmarks/benchmark_serving.py \
+    --port {{PROXY_PORT}} \
+    --model {{MODEL}} \
+    --dataset-name random \
+    --random-input-len 30000 \
+    --random-output-len 10 \
+    --num-prompts {{NUM_PROMPTS}} \
+    --seed $(date +%s)
+
+# Single-concurrency latency benchmark through the proxy.
+benchmark_one INPUT_LEN:
+    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
+    --port {{PROXY_PORT}} \
+    --model {{MODEL}} \
+    --input-len {{INPUT_LEN}} \
+    --output-len 1 \
+    --num-requests 10 \
+    --seed $(date +%s)
+
+# Same benchmark, but hitting the decode server directly (no P/D split).
+benchmark_one_no_pd INPUT_LEN:
+    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
+    --port {{DECODE_PORT}} \
+    --model {{MODEL}} \
+    --input-len {{INPUT_LEN}} \
+    --output-len 1 \
+    --num-requests 10 \
+    --seed $(date +%s)
+
+# GSM8K accuracy eval against the proxy endpoint via lm-eval-harness.
+eval:
+    lm_eval --model local-completions --tasks gsm8k \
+    --model_args model={{MODEL}},base_url=http://127.0.0.1:{{PROXY_PORT}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
+    --limit 1000