mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-20 06:55:01 +08:00
[Core] Update outlines and increase its threadpool size (#11140)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
parent
24a3d12b82
commit
48259264a4
@ -19,7 +19,7 @@ prometheus_client >= 0.18.0
|
|||||||
prometheus-fastapi-instrumentator >= 7.0.0
|
prometheus-fastapi-instrumentator >= 7.0.0
|
||||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||||
lm-format-enforcer >= 0.10.9, < 0.11
|
lm-format-enforcer >= 0.10.9, < 0.11
|
||||||
outlines == 0.1.9
|
outlines == 0.1.11
|
||||||
xgrammar >= 0.1.6; platform_machine == "x86_64"
|
xgrammar >= 0.1.6; platform_machine == "x86_64"
|
||||||
typing_extensions >= 4.10
|
typing_extensions >= 4.10
|
||||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
import os
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from json import dumps as json_dumps
|
from json import dumps as json_dumps
|
||||||
from re import escape as regex_escape
|
from re import escape as regex_escape
|
||||||
@ -48,6 +49,11 @@ pair : UNESCAPED_STRING ":" value
|
|||||||
|
|
||||||
global_thread_pool = None # used for generating logits processor fsm
|
global_thread_pool = None # used for generating logits processor fsm
|
||||||
|
|
||||||
|
# It's not yet clear that using more threads provides a benefit, and it could
|
||||||
|
# potentially starve other processes on the machine. We'll cap this for now and
|
||||||
|
# adjust later if testing proves it to help overcome a bottleneck.
|
||||||
|
_MAX_THREADPOOL_WORKERS = 16
|
||||||
|
|
||||||
|
|
||||||
async def get_outlines_guided_decoding_logits_processor(
|
async def get_outlines_guided_decoding_logits_processor(
|
||||||
guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
|
guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
|
||||||
@ -65,8 +71,11 @@ async def get_outlines_guided_decoding_logits_processor(
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
if global_thread_pool is None:
|
if global_thread_pool is None:
|
||||||
|
max_workers = os.cpu_count() or 2
|
||||||
|
if max_workers > _MAX_THREADPOOL_WORKERS:
|
||||||
|
max_workers = _MAX_THREADPOOL_WORKERS
|
||||||
global_thread_pool = concurrent.futures.ThreadPoolExecutor(
|
global_thread_pool = concurrent.futures.ThreadPoolExecutor(
|
||||||
max_workers=2)
|
max_workers=max_workers)
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
return await loop.run_in_executor(global_thread_pool,
|
return await loop.run_in_executor(global_thread_pool,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user