# Needed for the proxy server (path to a local vLLM checkout; note the trailing slash).
vllm-directory := "/home/rshaw/vllm/"

# MODEL := "Qwen/Qwen3-0.6B"
MODEL := "meta-llama/Llama-3.1-8B-Instruct"

# Ports: clients talk to the proxy, which fans out to prefill/decode servers.
PROXY_PORT := "8192"
PREFILL_PORT := "8100"
DECODE_PORT := "8200"

# Launch the prefill-side vLLM server (NIXL KV transfer, side channel 5557).
# kv_role is "kv_both" on both sides; the proxy decides routing.
prefill:
    VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
    CUDA_VISIBLE_DEVICES=0,7 \
    vllm serve {{MODEL}} \
        --port {{PREFILL_PORT}} \
        --tensor-parallel-size 2 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

# Launch the decode-side vLLM server (NIXL KV transfer, side channel 5567).
decode:
    VLLM_NIXL_SIDE_CHANNEL_PORT=5567 \
    CUDA_VISIBLE_DEVICES=4,5 \
    vllm serve {{MODEL}} \
        --port {{DECODE_PORT}} \
        --tensor-parallel-size 2 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'

# Start the toy proxy that routes completions across the prefill/decode pair.
proxy:
    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
        --port {{PROXY_PORT}} \
        --prefiller-port {{PREFILL_PORT}} \
        --decoder-port {{DECODE_PORT}}

# Send one completion request through the proxy (sanity check).
# The prompt must be a single line: raw newlines are not valid inside a JSON string.
send_request:
    curl -X POST http://localhost:{{PROXY_PORT}}/v1/completions \
        -H "Content-Type: application/json" \
        -d '{ \
            "model": "{{MODEL}}", \
            "prompt": "Red Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
            "max_tokens": 150, \
            "temperature": 0.7 \
        }'

# Serving benchmark: NUM_PROMPTS random prompts (30k in / 10 out) via the proxy.
# Seeded from the clock so repeated runs use different prompt sets.
benchmark NUM_PROMPTS:
    python {{vllm-directory}}benchmarks/benchmark_serving.py \
        --port {{PROXY_PORT}} \
        --model {{MODEL}} \
        --dataset-name random \
        --random-input-len 30000 \
        --random-output-len 10 \
        --num-prompts {{NUM_PROMPTS}} \
        --seed $(date +%s)

# Single-concurrency latency benchmark through the proxy (P/D disaggregated path).
benchmark_one INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --port {{PROXY_PORT}} \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s)

# Same benchmark, but hitting the decode server directly (no prefill/decode split),
# for an apples-to-apples baseline against `benchmark_one`.
benchmark_one_no_pd INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --port {{DECODE_PORT}} \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s)

# Accuracy check: GSM8K via lm-eval against the proxy's OpenAI-compatible endpoint.
eval:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:{{PROXY_PORT}}/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000