diff --git a/tools/pd_disagg/Justfile b/tools/pd_disagg/Justfile index 27bf415521eec..062281b8effad 100644 --- a/tools/pd_disagg/Justfile +++ b/tools/pd_disagg/Justfile @@ -1,24 +1,30 @@ # Needed for the proxy server vllm-directory := "/home/rshaw/vllm/" -PREFILL_GPU := "0" -DECODE_GPU := "2" +PREFILL_GPU := "0,1,2,3" +DECODE_GPU := "4,5,6,7" + +PREFILL_TP := env("PREFILL_TP", "1") +DECODE_TP := env("DECODE_TP", "1") + +BLOCK_SIZE := env("BLOCK_SIZE", "128") + MODEL := "meta-llama/Llama-3.1-8B-Instruct" PROXY_PORT := "8192" PREFILL_PORT := "8100" DECODE_PORT := "8200" PREFILL_NIXL_SIDE_CHANNEL_PORT := "5557" -DECODE_NIXL_SIDE_CHANNEL_PORT := "5558" +DECODE_NIXL_SIDE_CHANNEL_PORT := "5568" prefill: VLLM_NIXL_SIDE_CHANNEL_PORT={{PREFILL_NIXL_SIDE_CHANNEL_PORT}} \ CUDA_VISIBLE_DEVICES={{PREFILL_GPU}} \ vllm serve {{MODEL}} \ --port {{PREFILL_PORT}} \ - --tensor-parallel-size 1 \ + --tensor-parallel-size {{PREFILL_TP}} \ --enforce-eager \ --disable-log-requests \ - --block-size 128 \ + --block-size {{BLOCK_SIZE}} \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' decode: @@ -26,10 +32,10 @@ decode: CUDA_VISIBLE_DEVICES={{DECODE_GPU}} \ vllm serve {{MODEL}} \ --port {{DECODE_PORT}} \ - --tensor-parallel-size 1 \ + --tensor-parallel-size {{DECODE_TP}} \ --enforce-eager \ --disable-log-requests \ - --block-size 128 \ + --block-size {{BLOCK_SIZE}} \ --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' proxy: @@ -59,7 +65,7 @@ benchmark NUM_PROMPTS: --seed $(date +%s) \ benchmark_one INPUT_LEN: - python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \ + python {{vllm-directory}}benchmarks/benchmark_one_concurrent.py \ --port {{PROXY_PORT}} \ --model {{MODEL}} \ --input-len {{INPUT_LEN}} \ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 29a1b66408ddc..ff120a04b0132 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1044,10 +1044,7 @@ class NixlConnectorWorker: ) # Begin async xfer. - start = time.perf_counter() self.nixl_wrapper.transfer(handle) - end = time.perf_counter() - logger.info("========== TRANSFER: %s ==========", end - start) # Use handle to check completion in future step(). # TODO (NickLucche) surface xfer elapsed time