[v0][Core] Use xgrammar shared context to avoid copy overhead for offline engine (#13837)

Signed-off-by: Seth Kimmel <seth.kimmel3@gmail.com>
2025-12-24 15:46:51 +08:00 · 2025-02-25 22:58:24 -08:00 · 2025-02-25 22:58:24 -08:00 · e206b54331
commit e206b54331
parent 1d35662e6d
1 changed files with 23 additions and 3 deletions
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -3,7 +3,6 @@
 # noqa: UP007
 from __future__ import annotations

-import copy
 import json
 import re
 from dataclasses import dataclass, field
@@ -348,5 +347,26 @@ class XGrammarLogitsProcessor:
        return scores

    def clone(self) -> XGrammarLogitsProcessor:
-        """Deepcopy due to per-sequence state in the matchers"""
-        return copy.deepcopy(self)
+        """Create a new instance that shares the compiled grammar context
+        but has its own matchers and a reset ``prefilled`` flag."""
+        new_processor = XGrammarLogitsProcessor(self.config)
+
+        # Share the compiled grammar context (immutable after compilation)
+        new_processor.ctx = self.ctx
+
+        # Create fresh matchers for the new sequence
+        if self.ctx is not None:
+            new_processor.matchers = [
+                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
+            ]
+
+        # Share the already-allocated token bitmask (same object, not a copy)
+        if hasattr(self, 'token_bitmask') and self.token_bitmask is not None:
+            new_processor.token_bitmask = self.token_bitmask
+
+        # Copy simple attributes
+        new_processor.batch_size = self.batch_size
+        # Reset prefilled state for new sequence
+        new_processor.prefilled = False
+
+        return new_processor