Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
This commit is contained in:
rshaw@neuralmagic.com 2025-07-01 02:49:54 +00:00
parent 56939c835d
commit ff5a0cfa6e
2 changed files with 5 additions and 4 deletions

View File

@@ -12,8 +12,9 @@ port PORT:
prefill:
VLLM_IS_PREFILL=1 \
VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
CUDA_VISIBLE_DEVICES=0 \
CUDA_VISIBLE_DEVICES=7 \
vllm serve {{MODEL}} \
--port $(just port 8100) \
--tensor-parallel-size 1 \
@@ -23,8 +24,9 @@ prefill:
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
decode:
VLLM_IS_PREFILL=0 \
VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \
CUDA_VISIBLE_DEVICES=1 \
CUDA_VISIBLE_DEVICES=6 \
vllm serve {{MODEL}} \
--port $(just port 8300) \
--tensor-parallel-size 1 \

View File

@@ -7,7 +7,6 @@ import time
import uuid
from collections import defaultdict
from collections.abc import Iterator
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional
@@ -335,7 +334,7 @@ class NixlConnectorWorker:
# Agent.
import os
num_workers = 32
num_workers = 64
# Setting num_workers on the prefiller causes no notifications to be received (root cause unknown).
# HACK: as a workaround, make sure num_workers is set to 1 on the prefiller.
if os.getenv("VLLM_IS_PREFILL", "0") == "1":