From bd57841c7ba605cf958f6185012a5b91726f5ff7 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Mon, 7 Jul 2025 01:14:10 +0000
Subject: [PATCH] updated

Signed-off-by: Robert Shaw
---
 tools/Justfile | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 tools/Justfile

diff --git a/tools/Justfile b/tools/Justfile
new file mode 100644
index 0000000000000..0ffced37272ee
--- /dev/null
+++ b/tools/Justfile
@@ -0,0 +1,90 @@
+# Justfile for a disaggregated prefill/decode vLLM deployment using the
+# NixlConnector KV-transfer path, plus proxy, benchmark, and eval recipes.
+
+# Needed for the proxy server (must end with a trailing slash).
+vllm-directory := "/home/rshaw/vllm/"
+
+# MODEL := "Qwen/Qwen3-0.6B"
+MODEL := "meta-llama/Llama-3.1-8B-Instruct"
+PROXY_PORT := "8192"
+PREFILL_PORT := "8100"
+DECODE_PORT := "8200"
+
+# Start the prefill-side vLLM server (tensor-parallel across GPUs 0 and 7).
+prefill:
+    VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
+    CUDA_VISIBLE_DEVICES=0,7 \
+    vllm serve {{MODEL}} \
+    --port {{PREFILL_PORT}} \
+    --tensor-parallel-size 2 \
+    --enforce-eager \
+    --disable-log-requests \
+    --block-size 128 \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+
+# Start the decode-side vLLM server (separate GPUs and side-channel port).
+decode:
+    VLLM_NIXL_SIDE_CHANNEL_PORT=5567 \
+    CUDA_VISIBLE_DEVICES=4,5 \
+    vllm serve {{MODEL}} \
+    --port {{DECODE_PORT}} \
+    --tensor-parallel-size 2 \
+    --enforce-eager \
+    --disable-log-requests \
+    --block-size 128 \
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+
+# Run the toy proxy that routes requests to the prefill/decode servers.
+proxy:
+    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
+    --port {{PROXY_PORT}} \
+    --prefiller-port {{PREFILL_PORT}} \
+    --decoder-port {{DECODE_PORT}}
+
+# Send a single completion request through the proxy.
+send_request:
+    curl -X POST http://localhost:{{PROXY_PORT}}/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{ \
+    "model": "{{MODEL}}", \
+    "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
+    "max_tokens": 150, \
+    "temperature": 0.7 \
+    }'
+
+# Serving benchmark through the proxy with NUM_PROMPTS random prompts.
+benchmark NUM_PROMPTS:
+    python {{vllm-directory}}benchmarks/benchmark_serving.py \
+    --port {{PROXY_PORT}} \
+    --model {{MODEL}} \
+    --dataset-name random \
+    --random-input-len 30000 \
+    --random-output-len 10 \
+    --num-prompts {{NUM_PROMPTS}} \
+    --seed $(date +%s)
+
+# Single-concurrency latency benchmark through the proxy.
+benchmark_one INPUT_LEN:
+    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
+    --port {{PROXY_PORT}} \
+    --model {{MODEL}} \
+    --input-len {{INPUT_LEN}} \
+    --output-len 1 \
+    --num-requests 10 \
+    --seed $(date +%s)
+
+# Same benchmark, but hitting the decode server directly (no P/D split).
+benchmark_one_no_pd INPUT_LEN:
+    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
+    --port {{DECODE_PORT}} \
+    --model {{MODEL}} \
+    --input-len {{INPUT_LEN}} \
+    --output-len 1 \
+    --num-requests 10 \
+    --seed $(date +%s)
+
+# GSM8K accuracy eval against the proxy endpoint via lm-eval-harness.
+eval:
+    lm_eval --model local-completions --tasks gsm8k \
+    --model_args model={{MODEL}},base_url=http://127.0.0.1:{{PROXY_PORT}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
+    --limit 1000