From e206b5433109d298e53451015465b2bf8f03ef0a Mon Sep 17 00:00:00 2001
From: Seth Kimmel
Date: Tue, 25 Feb 2025 22:58:24 -0800
Subject: [PATCH] [v0][Core] Use xgrammar shared context to avoid copy overhead
 for offline engine (#13837)

Signed-off-by: Seth Kimmel
---
 .../guided_decoding/xgrammar_decoding.py      | 26 ++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 329b03a573dae..e6ba7f5ecc6ee 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -3,7 +3,6 @@
 # noqa: UP007
 from __future__ import annotations
 
-import copy
 import json
 import re
 from dataclasses import dataclass, field
@@ -348,5 +347,26 @@ class XGrammarLogitsProcessor:
         return scores
 
     def clone(self) -> XGrammarLogitsProcessor:
-        """Deepcopy due to per-sequence state in the matchers"""
-        return copy.deepcopy(self)
+        """Create a new instance with shared compiled grammar
+        but separate state"""
+        new_processor = XGrammarLogitsProcessor(self.config)
+
+        # Share the compiled grammar context (immutable after compilation)
+        new_processor.ctx = self.ctx
+
+        # Create fresh matchers for the new sequence
+        if self.ctx is not None:
+            new_processor.matchers = [
+                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
+            ]
+
+        # Share the existing token bitmask (same size; no reallocation)
+        if hasattr(self, 'token_bitmask') and self.token_bitmask is not None:
+            new_processor.token_bitmask = self.token_bitmask
+
+        # Copy simple attributes
+        new_processor.batch_size = self.batch_size
+        # Reset prefilled state for new sequence
+        new_processor.prefilled = False
+
+        return new_processor