[V1] Resolve failed concurrent structured output requests (#19565)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-12-22 20:55:01 +08:00 · 2025-06-12 19:30:09 -04:00 · 2025-06-12 19:30:09 -04:00 · c57bb199b3
commit c57bb199b3
parent dba68f9159
1 changed files with 8 additions and 1 deletions
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -66,11 +66,15 @@ from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing,

 if TYPE_CHECKING:
    import xgrammar as xgr
+    import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile  # noqa: E501

    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
    from vllm.v1.core.sched.output import SchedulerOutput
 else:
    xgr = LazyLoader("xgr", globals(), "xgrammar")
+    xgr_torch_compile = LazyLoader(
+        "xgr_torch_compile", globals(),
+        "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile")

 logger = init_logger(__name__)

@@ -1103,7 +1107,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        # so we receive it in that format.
        grammar_bitmask = torch.from_numpy(grammar_bitmask)

-        xgr.apply_token_bitmask_inplace(
+        # Force use of the torch.compile implementation from xgrammar to work
+        # around issues with the Triton kernel in concurrent structured output
+        # scenarios. See PR #19565 and issues #19493, #18376 for details.
+        xgr_torch_compile.apply_token_bitmask_inplace_torch_compile(
            logits,
            grammar_bitmask.to(self.device, non_blocking=True),
            indices=out_indices,