mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-01-23 16:14:37 +08:00
Upgrade xgrammar to 0.1.23 (#22988)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
parent
42dc59dbac
commit
e32a0e8678
@ -25,7 +25,7 @@ outlines == 0.1.11 ; platform_machine == "s390x"
|
||||
# required for outlines backend disk cache
|
||||
diskcache == 5.6.3
|
||||
lark == 1.2.2
|
||||
- xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
+ xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
|
||||
@ -90,15 +90,11 @@ from .utils import (AttentionGroup, MultiModalBudget,
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr
|
||||
- import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501
|
||||
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
else:
|
||||
xgr = LazyLoader("xgr", globals(), "xgrammar")
|
||||
- xgr_torch_compile = LazyLoader(
|
||||
- "xgr_torch_compile", globals(),
|
||||
- "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile")
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -1333,10 +1329,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
# so we receive it in that format.
|
||||
grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
|
||||
|
||||
- # Force use of the torch.compile implementation from xgrammar to work
|
||||
- # around issues with the Triton kernel in concurrent structured output
|
||||
- # scenarios. See PR #19565 and issues #19493, #18376 for details.
|
||||
- xgr_torch_compile.apply_token_bitmask_inplace_torch_compile(
|
||||
+ xgr.apply_token_bitmask_inplace(
|
||||
logits,
|
||||
grammar_bitmask.to(self.device, non_blocking=True),
|
||||
indices=out_indices if not skip_out_indices else None,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user