mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-28 21:57:08 +08:00
[Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
parent
4716377fbc
commit
cb84e45ac7
@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
|
|||||||
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||||
outlines == 0.1.11
|
outlines == 0.1.11
|
||||||
lark == 1.2.2
|
lark == 1.2.2
|
||||||
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
|
xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||||
typing_extensions >= 4.10
|
typing_extensions >= 4.10
|
||||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||||
partial-json-parser # used for parsing partial JSON outputs
|
partial-json-parser # used for parsing partial JSON outputs
|
||||||
|
|||||||
@ -106,6 +106,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
|
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
|
||||||
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
|
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
|
||||||
VLLM_USE_DEEP_GEMM: bool = False
|
VLLM_USE_DEEP_GEMM: bool = False
|
||||||
|
VLLM_XGRAMMAR_CACHE_MB: int = 0
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@ -697,6 +698,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# Allow use of DeepGemm kernels for fused moe ops.
|
# Allow use of DeepGemm kernels for fused moe ops.
|
||||||
"VLLM_USE_DEEP_GEMM":
|
"VLLM_USE_DEEP_GEMM":
|
||||||
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
|
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
|
||||||
|
|
||||||
|
# Control the cache size used by the xgrammar compiler. The default
|
||||||
|
# of 512 MB should be enough for roughly 1000 JSON schemas.
|
||||||
|
# It can be changed with this variable if needed for some reason.
|
||||||
|
"VLLM_XGRAMMAR_CACHE_MB":
|
||||||
|
lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
|
||||||
}
|
}
|
||||||
|
|
||||||
# end-env-vars-definition
|
# end-env-vars-definition
|
||||||
|
|||||||
@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, List
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
import vllm.envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -131,8 +132,13 @@ class GrammarCompilerCache:
|
|||||||
encoded_vocab=config_data.encoded_vocab,
|
encoded_vocab=config_data.encoded_vocab,
|
||||||
metadata=config_data.metadata,
|
metadata=config_data.metadata,
|
||||||
)
|
)
|
||||||
|
cache_size = vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024
|
||||||
cls._cache[cache_key] = xgr.GrammarCompiler(
|
cls._cache[cache_key] = xgr.GrammarCompiler(
|
||||||
tokenizer_info, max_threads=config.max_threads)
|
tokenizer_info,
|
||||||
|
max_threads=config.max_threads,
|
||||||
|
cache_enabled=True,
|
||||||
|
cache_limit_bytes=cache_size,
|
||||||
|
)
|
||||||
|
|
||||||
return cls._cache[cache_key]
|
return cls._cache[cache_key]
|
||||||
|
|
||||||
|
|||||||
@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
import vllm.envs
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
||||||
@ -76,7 +77,12 @@ class XgrammarBackend(StructuredOutputBackend):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
vocab_size=self.vocab_size,
|
vocab_size=self.vocab_size,
|
||||||
)
|
)
|
||||||
self.compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=8)
|
self.compiler = xgr.GrammarCompiler(
|
||||||
|
tokenizer_info,
|
||||||
|
max_threads=8,
|
||||||
|
cache_enabled=True,
|
||||||
|
cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024,
|
||||||
|
)
|
||||||
|
|
||||||
def compile_grammar(self, request_type: StructuredOutputOptions,
|
def compile_grammar(self, request_type: StructuredOutputOptions,
|
||||||
grammar_spec: str) -> StructuredOutputGrammar:
|
grammar_spec: str) -> StructuredOutputGrammar:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user