From c47aafa37c7579c3f9b3188b05f43cb71d83dbb5 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Wed, 19 Mar 2025 18:30:43 -0700
Subject: [PATCH] [BugFix] Lazily import XgrammarBackend to avoid early cuda init (#15171)

Signed-off-by: Nick Hill
---
 vllm/v1/structured_output/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 58ac00e985a9..0fdc45c279cb 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -9,7 +9,6 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar)
-from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
 
 if TYPE_CHECKING:
     import numpy as np
@@ -47,6 +46,9 @@ class StructuredOutputManager:
         if self.backend is None:
             backend_name = request.sampling_params.guided_decoding.backend_name
             if backend_name == "xgrammar":
+                from vllm.v1.structured_output.backend_xgrammar import (
+                    XgrammarBackend)
+
                 self.backend = XgrammarBackend(self.vllm_config)
             else:
                 raise ValueError(