From e206b5433109d298e53451015465b2bf8f03ef0a Mon Sep 17 00:00:00 2001
From: Seth Kimmel
Date: Tue, 25 Feb 2025 22:58:24 -0800
Subject: [PATCH] [v0][Core] Use xgrammar shared context to avoid copy overhead
 for offline engine (#13837)

Signed-off-by: Seth Kimmel
---
 .../guided_decoding/xgrammar_decoding.py      | 26 ++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 329b03a573dae..e6ba7f5ecc6ee 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -3,7 +3,6 @@
 # noqa: UP007
 from __future__ import annotations
 
-import copy
 import json
 import re
 from dataclasses import dataclass, field
@@ -348,5 +347,26 @@ class XGrammarLogitsProcessor:
         return scores
 
     def clone(self) -> XGrammarLogitsProcessor:
-        """Deepcopy due to per-sequence state in the matchers"""
-        return copy.deepcopy(self)
+        """Create a new instance with shared compiled grammar
+        but separate state"""
+        new_processor = XGrammarLogitsProcessor(self.config)
+
+        # Share the compiled grammar context (immutable after compilation)
+        new_processor.ctx = self.ctx
+
+        # Create fresh matchers for the new sequence
+        if self.ctx is not None:
+            new_processor.matchers = [
+                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
+            ]
+
+        # Share the existing token bitmask (same size; no reallocation)
+        if hasattr(self, 'token_bitmask') and self.token_bitmask is not None:
+            new_processor.token_bitmask = self.token_bitmask
+
+        # Copy simple attributes
+        new_processor.batch_size = self.batch_size
+        # Reset prefilled state for new sequence
+        new_processor.prefilled = False
+
+        return new_processor