[Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
Russell Bryant 2025-04-08 22:13:22 -04:00 committed by GitHub
parent 4716377fbc
commit cb84e45ac7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 22 additions and 3 deletions

View File

@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11 outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64" xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs

View File

@ -106,6 +106,7 @@ if TYPE_CHECKING:
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_BUCKET_PADDING_GAP: int = 0
VLLM_USE_DEEP_GEMM: bool = False VLLM_USE_DEEP_GEMM: bool = False
VLLM_XGRAMMAR_CACHE_MB: int = 0
def get_default_cache_root(): def get_default_cache_root():
@ -697,6 +698,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Allow use of DeepGemm kernels for fused moe ops. # Allow use of DeepGemm kernels for fused moe ops.
"VLLM_USE_DEEP_GEMM": "VLLM_USE_DEEP_GEMM":
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
# Control the cache size (in MB) used by the xgrammar compiler. The default
# of 512 MB should be enough for roughly 1000 JSON schemas.
# It can be changed with this variable if needed for some reason.
"VLLM_XGRAMMAR_CACHE_MB":
lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
} }
# end-env-vars-definition # end-env-vars-definition

View File

@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, List
import torch import torch
import vllm.envs
from vllm.logger import init_logger from vllm.logger import init_logger
try: try:
@ -131,8 +132,13 @@ class GrammarCompilerCache:
encoded_vocab=config_data.encoded_vocab, encoded_vocab=config_data.encoded_vocab,
metadata=config_data.metadata, metadata=config_data.metadata,
) )
cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024
cls._cache[cache_key] = xgr.GrammarCompiler( cls._cache[cache_key] = xgr.GrammarCompiler(
tokenizer_info, max_threads=config.max_threads) tokenizer_info,
max_threads=config.max_threads,
cache_enabled=True,
cache_limit_bytes=cache_size,
)
return cls._cache[cache_key] return cls._cache[cache_key]

View File

@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
import torch import torch
import vllm.envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
@ -76,7 +77,12 @@ class XgrammarBackend(StructuredOutputBackend):
tokenizer, tokenizer,
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
) )
self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8) self.compiler = xgr.GrammarCompiler(
tokenizer_info,
max_threads=8,
cache_enabled=True,
cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
)
def compile_grammar(self, request_type: StructuredOutputOptions, def compile_grammar(self, request_type: StructuredOutputOptions,
grammar_spec: str) -> StructuredOutputGrammar: grammar_spec: str) -> StructuredOutputGrammar: