mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-25 18:14:41 +08:00
93 lines
3.0 KiB
Makefile
93 lines
3.0 KiB
Makefile
# Setting this allows creating a symlink to this Justfile from another dir.
set working-directory := "/home/rshaw/vllm/pd_examples/"

# Path to the vLLM checkout; needed for the proxy server and benchmark
# scripts. NOTE: keep the trailing slash — recipes interpolate it directly
# (e.g. "{{vllm-directory}}benchmarks/...").
vllm-directory := "/home/rshaw/vllm/"

# Model served by both the prefill and decode instances.
# MODEL := "Qwen/Qwen3-0.6B"
MODEL := "meta-llama/Llama-3.1-8B-Instruct"
# Map a base PORT number to the actual allocated port.
# Delegates to port_allocator.py; other recipes call this via `$(just port N)`.
port PORT:
    @python port_allocator.py {{PORT}}
# Launch the prefill vLLM instance on GPU 0, serving on base port 8100.
# NOTE(review): uses a fixed NIXL side-channel port (5557), while `decode`
# allocates its side-channel port via `just port` — confirm the asymmetry
# is intentional.
prefill:
    VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
    CUDA_VISIBLE_DEVICES=0 \
    vllm serve {{MODEL}} \
        --port $(just port 8100) \
        --tensor-parallel-size 1 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
# Launch the decode vLLM instance on GPU 1, serving on base port 8300.
decode:
    VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \
    CUDA_VISIBLE_DEVICES=1 \
    vllm serve {{MODEL}} \
        --port $(just port 8300) \
        --tensor-parallel-size 1 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
# Launch the toy proxy server (base port 8192) that fronts the
# prefill (8100) and decode (8300) instances.
proxy:
    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
        --port $(just port 8192) \
        --prefiller-port $(just port 8100) \
        --prefiller-host localhost \
        --decoder-port $(just port 8300)
# Send a single completion request through the proxy (base port 8192)
# as a smoke test of the prefill->decode path.
send_request:
    curl -X POST http://localhost:$(just port 8192)/v1/completions \
        -H "Content-Type: application/json" \
        -d '{ \
            "model": "{{MODEL}}", \
            "prompt": "XXRed Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
            "max_tokens": 150, \
            "temperature": 0.7 \
        }'
# Run the serving benchmark against the proxy (base port 8192) with
# NUM_PROMPTS random prompts (10k in / 100 out tokens each).
# Fixes vs. original: removed the dangling trailing `\` after --seed
# (a line continuation into nothing), and dropped the double slash in
# the script path ({{vllm-directory}} already ends with "/", matching
# every other recipe).
benchmark NUM_PROMPTS:
    python {{vllm-directory}}benchmarks/benchmark_serving.py \
        --port $(just port 8192) \
        --model {{MODEL}} \
        --dataset-name random \
        --random-input-len 10000 \
        --random-output-len 100 \
        --num-prompts {{NUM_PROMPTS}} \
        --seed $(date +%s)
# Benchmark single-concurrency requests of INPUT_LEN tokens through the
# proxy (base port 8192), i.e. the full prefill/decode disaggregated path.
benchmark_one INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s) \
        --port $(just port 8192)
# Same as benchmark_one, but hits the prefill instance directly
# (base port 8100), bypassing the proxy / PD disaggregation.
benchmark_one_no_pd INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s) \
        --port $(just port 8100)
# Reset the prefix cache on both serving instances.
# Fix vs. original: the second curl targeted base port 8200, but the
# decode instance serves on base port 8300 (nothing in this file uses
# 8200) — corrected to 8300.
reset_prefix_cache:
    curl -X POST http://localhost:$(just port 8100)/reset_prefix_cache && \
    curl -X POST http://localhost:$(just port 8300)/reset_prefix_cache
# Run gsm8k accuracy eval (lm-eval, local-completions backend) through
# the proxy (base port 8192), 100-way concurrent, first 1000 samples.
eval:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port 8192)/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000
# Same eval as `eval`, but against an arbitrary base PORT (e.g. 8100 to
# eval the prefill instance directly).
eval_port PORT:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port {{PORT}})/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000