mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-26 19:25:52 +08:00
parent
56939c835d
commit
ff5a0cfa6e
@ -12,8 +12,9 @@ port PORT:
|
||||
|
||||
|
||||
prefill:
|
||||
VLLM_IS_PREFILL=1 \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
|
||||
CUDA_VISIBLE_DEVICES=0 \
|
||||
CUDA_VISIBLE_DEVICES=7 \
|
||||
vllm serve {{MODEL}} \
|
||||
--port $(just port 8100) \
|
||||
--tensor-parallel-size 1 \
|
||||
@ -23,8 +24,9 @@ prefill:
|
||||
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
|
||||
decode:
|
||||
VLLM_IS_PREFILL=0 \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \
|
||||
CUDA_VISIBLE_DEVICES=1 \
|
||||
CUDA_VISIBLE_DEVICES=6 \
|
||||
vllm serve {{MODEL}} \
|
||||
--port $(just port 8300) \
|
||||
--tensor-parallel-size 1 \
|
||||
|
||||
@ -7,7 +7,6 @@ import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from collections.abc import Iterator
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
@ -335,7 +334,7 @@ class NixlConnectorWorker:
|
||||
|
||||
# Agent.
|
||||
import os
|
||||
num_workers = 32
|
||||
num_workers = 64
|
||||
# Setting num_workers on the prefiller causes no notifications to be received (reason unknown).
|
||||
# HACK: make sure num_workers is set to 1 on the prefiller.
|
||||
if os.getenv("VLLM_IS_PREFILL", "0") == "1":
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user