[Core] Update outlines and increase its threadpool size (#11140)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-12-20 06:55:01 +08:00 · 2024-12-14 02:46:18 -05:00 · 2024-12-14 02:46:18 -05:00 · 48259264a4
commit 48259264a4
parent 24a3d12b82
2 changed files with 11 additions and 2 deletions
--- a/requirements-common.txt
+++ b/requirements-common.txt
@ -19,7 +19,7 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.9
+outlines == 0.1.11
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@ -1,5 +1,6 @@
 import asyncio
 import concurrent.futures
 import os
 from enum import Enum
 from json import dumps as json_dumps
 from re import escape as regex_escape
@ -48,6 +49,11 @@ pair   : UNESCAPED_STRING ":" value
 global_thread_pool = None  # used for generating logits processor fsm
 # It's not yet clear that using more threads provides a benefit, and it could
 # potentially starve other processes on the machine. We'll cap the pool size for
 # now and raise it later if testing proves that helps overcome a bottleneck.
 _MAX_THREADPOOL_WORKERS = 16
 async def get_outlines_guided_decoding_logits_processor(
    guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
@ -65,8 +71,11 @@ async def get_outlines_guided_decoding_logits_processor(
        return None
    if global_thread_pool is None:
        max_workers = os.cpu_count() or 2
        if max_workers > _MAX_THREADPOOL_WORKERS:
            max_workers = _MAX_THREADPOOL_WORKERS
        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
-            max_workers=2)
+            max_workers=max_workers)
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(global_thread_pool,