mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 23:25:01 +08:00
[Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
parent
4716377fbc
commit
cb84e45ac7
@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
|
||||
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -106,6 +106,7 @@ if TYPE_CHECKING:
|
||||
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
|
||||
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
|
||||
VLLM_USE_DEEP_GEMM: bool = False
|
||||
VLLM_XGRAMMAR_CACHE_MB: int = 0
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@ -697,6 +698,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# Allow use of DeepGemm kernels for fused moe ops.
|
||||
"VLLM_USE_DEEP_GEMM":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
|
||||
|
||||
# Control the cache size used by the xgrammar compiler. The default
|
||||
# of 512 MB should be enough for roughly 1000 JSON schemas.
|
||||
# It can be changed with this variable if needed for some reason.
|
||||
"VLLM_XGRAMMAR_CACHE_MB":
|
||||
lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, List
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
try:
|
||||
@ -131,8 +132,13 @@ class GrammarCompilerCache:
|
||||
encoded_vocab=config_data.encoded_vocab,
|
||||
metadata=config_data.metadata,
|
||||
)
|
||||
cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024
|
||||
cls._cache[cache_key] = xgr.GrammarCompiler(
|
||||
tokenizer_info, max_threads=config.max_threads)
|
||||
tokenizer_info,
|
||||
max_threads=config.max_threads,
|
||||
cache_enabled=True,
|
||||
cache_limit_bytes=cache_size,
|
||||
)
|
||||
|
||||
return cls._cache[cache_key]
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
||||
@ -76,7 +77,12 @@ class XgrammarBackend(StructuredOutputBackend):
|
||||
tokenizer,
|
||||
vocab_size=self.vocab_size,
|
||||
)
|
||||
self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)
|
||||
self.compiler = xgr.GrammarCompiler(
|
||||
tokenizer_info,
|
||||
max_threads=8,
|
||||
cache_enabled=True,
|
||||
cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
|
||||
)
|
||||
|
||||
def compile_grammar(self, request_type: StructuredOutputOptions,
|
||||
grammar_spec: str) -> StructuredOutputGrammar:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user