From b724afe343b788d61ca0425892f2631669f97f45 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 24 Apr 2025 21:15:03 +0800 Subject: [PATCH] [V1][Structured Output] Clear xgrammar compiler object when engine core shut down to avoid nanobind leaked warning (#16954) Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm/v1/engine/core.py | 1 + vllm/v1/structured_output/__init__.py | 4 ++++ vllm/v1/structured_output/backend_guidance.py | 3 +++ vllm/v1/structured_output/backend_types.py | 6 ++++++ vllm/v1/structured_output/backend_xgrammar.py | 3 +++ 5 files changed, 17 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 65b97eb4a72e4..9590a9aadbec2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -253,6 +253,7 @@ class EngineCore: return engine_core_outputs def shutdown(self): + self.structured_output_manager.clear_backend() if self.model_executor: self.model_executor.shutdown() diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 218af43deb677..0fd66c0729602 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -107,3 +107,7 @@ class StructuredOutputManager: # np.ndarray, because that is much more efficient for serialization # and deserialization when sending this to the GPU workers. return bitmask_tensor.numpy() + + def clear_backend(self) -> None: + if self.backend is not None: + self.backend.destroy() diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index a59ec5efc53e1..6d2ccd4019d4c 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -108,6 +108,9 @@ class GuidanceBackend(StructuredOutputBackend): return llguidance_torch.allocate_token_bitmask( max_num_seqs, self.ll_tokenizer.vocab_size) + def destroy(self): + pass + @dataclass class GuidanceGrammar(StructuredOutputGrammar): diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 6dc2a92411de0..306e4aa0196cc 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -87,3 +87,9 @@ class StructuredOutputBackend(ABC): max_num_seqs (int): The maximum number of sequences for which to allocate the bitmask. """ + + @abstractmethod + def destroy(self): + """ + Backend-specific cleanup. + """ diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 0b5a1593b3eb6..d978ae2da85ce 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -124,6 +124,9 @@ class XgrammarBackend(StructuredOutputBackend): def allocate_token_bitmask(self, max_num_seqs: int): return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size) + def destroy(self): + del self.compiler + @dataclass class XgrammarGrammar(StructuredOutputGrammar):