[Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Author: Russell Bryant, 2025-04-08 22:13:22 -04:00 (committed by GitHub)
parent 4716377fbc
commit cb84e45ac7
4 changed files with 22 additions and 3 deletions


@@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
 llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
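
The pin bump above is what provides the constructor keywords used later in this commit: cache_enabled and cache_limit_bytes are passed to xgr.GrammarCompiler below, so xgrammar 0.1.18 is required at runtime. A sanity-check sketch, not part of the commit, assuming GrammarCompiler.__init__ is introspectable:

    # Sanity-check sketch (not part of the commit): confirm the installed
    # xgrammar matches the pin and exposes the cache-limit keywords used below.
    import importlib.metadata
    import inspect

    import xgrammar as xgr

    assert importlib.metadata.version("xgrammar") == "0.1.18"
    # cache_enabled / cache_limit_bytes are the kwargs this commit passes.
    params = inspect.signature(xgr.GrammarCompiler.__init__).parameters
    assert {"cache_enabled", "cache_limit_bytes"} <= params.keys()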


@@ -106,6 +106,7 @@ if TYPE_CHECKING:
     VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_USE_DEEP_GEMM: bool = False
+    VLLM_XGRAMMAR_CACHE_MB: int = 0

 def get_default_cache_root():
@@ -697,6 +698,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Allow use of DeepGemm kernels for fused moe ops.
     "VLLM_USE_DEEP_GEMM":
     lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
+
+    # Control the cache size used by the xgrammar compiler. The default
+    # of 512 MB should be enough for roughly 1000 JSON schemas.
+    # It can be changed with this variable if needed.
+    "VLLM_XGRAMMAR_CACHE_MB":
+    lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
 }
 # end-env-vars-definition
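
For context, vllm/envs.py resolves these entries lazily: each variable maps to a zero-argument lambda, and a module-level __getattr__ (PEP 562) evaluates it on attribute access, so vllm.envs.VLLM_XGRAMMAR_CACHE_MB reflects the environment at the time it is read. A minimal sketch of that pattern as it applies to the new entry (the shim below mirrors, rather than reproduces, the real module):

    # Minimal sketch of the lazy env-var pattern used by vllm.envs.
    import os
    from typing import Any, Callable

    environment_variables: dict[str, Callable[[], Any]] = {
        # Parsed on every attribute access, so a value exported before
        # first use is picked up without re-importing the module.
        "VLLM_XGRAMMAR_CACHE_MB":
        lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
    }

    def __getattr__(name: str) -> Any:
        # Module-level __getattr__ (PEP 562): accessing
        # vllm.envs.VLLM_XGRAMMAR_CACHE_MB evaluates the lambda above.
        if name in environment_variables:
            return environment_variables[name]()
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")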


@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, List

 import torch

+import vllm.envs
 from vllm.logger import init_logger

 try:
@@ -131,8 +132,13 @@ class GrammarCompilerCache:
                 encoded_vocab=config_data.encoded_vocab,
                 metadata=config_data.metadata,
             )
+            cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024
             cls._cache[cache_key] = xgr.GrammarCompiler(
-                tokenizer_info, max_threads=config.max_threads)
+                tokenizer_info,
+                max_threads=config.max_threads,
+                cache_enabled=True,
+                cache_limit_bytes=cache_size,
+            )
         return cls._cache[cache_key]
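
The cache_size line is a plain MB-to-bytes conversion; with the 512 MB default from envs.py, that budgets roughly half a megabyte per compiled schema at the ~1000-schema capacity quoted there. A quick worked check:

    # Worked check of the default from envs.py above: 512 MB in bytes, and
    # the implied average budget per compiled schema at ~1000 schemas.
    cache_mb = 512
    cache_limit_bytes = cache_mb * 1024 * 1024   # 536_870_912 bytes
    per_schema = cache_limit_bytes // 1000       # ~536_870 bytes, i.e. ~0.5 MB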


@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING

 import torch

+import vllm.envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs

@@ -76,7 +77,12 @@ class XgrammarBackend(StructuredOutputBackend):
             tokenizer,
             vocab_size=self.vocab_size,
         )
-        self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)
+        self.compiler = xgr.GrammarCompiler(
+            tokenizer_info,
+            max_threads=8,
+            cache_enabled=True,
+            cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
+        )

     def compile_grammar(self, request_type: StructuredOutputOptions,
                         grammar_spec: str) -> StructuredOutputGrammar:
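
End to end, the new knob is read when the backend is constructed, so it must be set before the engine starts. A hypothetical usage sketch (the model name and the 256 MB figure are illustrative only; guided_decoding_backend is forwarded to the engine arguments):

    import os

    # Must be set before vLLM reads vllm.envs.VLLM_XGRAMMAR_CACHE_MB,
    # i.e. before the engine and its xgrammar backend are constructed.
    os.environ["VLLM_XGRAMMAR_CACHE_MB"] = "256"

    from vllm import LLM

    # Illustrative model; any model served with the xgrammar backend works.
    llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
              guided_decoding_backend="xgrammar")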