From b724afe343b788d61ca0425892f2631669f97f45 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Thu, 24 Apr 2025 21:15:03 +0800
Subject: [PATCH] [V1][Structured Output] Clear xgrammar compiler object when
 engine core shut down to avoid nanobind leaked warning (#16954)

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm/v1/engine/core.py                        | 1 +
 vllm/v1/structured_output/__init__.py         | 4 ++++
 vllm/v1/structured_output/backend_guidance.py | 3 +++
 vllm/v1/structured_output/backend_types.py    | 6 ++++++
 vllm/v1/structured_output/backend_xgrammar.py | 3 +++
 5 files changed, 17 insertions(+)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 65b97eb4a72e4..9590a9aadbec2 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -253,6 +253,7 @@ class EngineCore:
         return engine_core_outputs
 
     def shutdown(self):
+        self.structured_output_manager.clear_backend()
         if self.model_executor:
             self.model_executor.shutdown()
 
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 218af43deb677..0fd66c0729602 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -107,3 +107,10 @@ class StructuredOutputManager:
         # np.ndarray, because that is much more efficient for serialization
         # and deserialization when sending this to the GPU workers.
         return bitmask_tensor.numpy()
+
+    def clear_backend(self) -> None:
+        """Ask the active backend, if one exists, to clean up after itself."""
+        backend = self.backend
+        if backend is None:
+            return
+        backend.destroy()
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index a59ec5efc53e1..6d2ccd4019d4c 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -108,6 +108,9 @@ class GuidanceBackend(StructuredOutputBackend):
         return llguidance_torch.allocate_token_bitmask(
             max_num_seqs, self.ll_tokenizer.vocab_size)
 
+    def destroy(self) -> None:
+        """Satisfy the backend interface; no cleanup is performed here."""
+
 
 @dataclass
 class GuidanceGrammar(StructuredOutputGrammar):
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 6dc2a92411de0..306e4aa0196cc 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -87,3 +87,9 @@ class StructuredOutputBackend(ABC):
             max_num_seqs (int): The maximum number of sequences for which
               to allocate the bitmask.
         """
+
+    @abstractmethod
+    def destroy(self) -> None:
+        """
+        Release backend resources; called when the engine core shuts down.
+        """
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index 0b5a1593b3eb6..d978ae2da85ce 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -124,6 +124,16 @@ class XgrammarBackend(StructuredOutputBackend):
     def allocate_token_bitmask(self, max_num_seqs: int):
         return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size)
 
+    def destroy(self) -> None:
+        """Drop the xgrammar compiler so it is released on shutdown.
+
+        Deleting the reference avoids nanobind's leak warning at exit.
+        Repeated calls are harmless: once the attribute is gone, this
+        method does nothing instead of raising ``AttributeError``.
+        """
+        if hasattr(self, "compiler"):
+            del self.compiler
+
 
 @dataclass
 class XgrammarGrammar(StructuredOutputGrammar):