From b854321ffe50fd04c6b1ac58eecdab4caf5b4295 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Thu, 10 Jul 2025 16:06:37 -0700
Subject: [PATCH] [Docs] Lazy import gguf (#20785)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 vllm/entrypoints/score_utils.py                  | 6 +++++-
 vllm/model_executor/model_loader/weight_utils.py | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 3fc4ed606b8a9..f3f042355c9eb 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -11,7 +11,6 @@ from vllm.entrypoints.chat_utils import (
     ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam,
     MultiModalItemTracker, _ContentPart, _parse_chat_message_content_part)
 from vllm.inputs import TokensPrompt
-from vllm.model_executor.model_loader import get_model_cls
 from vllm.model_executor.models.interfaces import supports_score_template
 from vllm.multimodal.inputs import MultiModalDataDict
 from vllm.outputs import PoolingRequestOutput
@@ -140,6 +139,8 @@ def apply_score_template(
     prompt_1: str,
     prompt_2: str,
 ) -> str:
+    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
+    from vllm.model_executor.model_loader import get_model_cls
 
     model = get_model_cls(model_config)
     if supports_score_template(model):
@@ -162,6 +163,9 @@ def post_process_tokens(
     Note:
         This is an in-place operation.
     """
+    # NOTE(Simon): lazy import to avoid bringing in all dependencies (e.g. gguf)
+    from vllm.model_executor.model_loader import get_model_cls
+
     model = get_model_cls(model_config)
     if supports_score_template(model):
         model.post_process_tokens(prompt)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 857f4bca68245..1058ae140b5b4 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -14,7 +14,6 @@ from pathlib import Path
 from typing import Any, Callable, Optional, Union
 
 import filelock
-import gguf
 import huggingface_hub.constants
 import numpy as np
 import torch
@@ -40,6 +39,11 @@ except (ImportError, OSError):
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(
         "SafetensorsStreamer")
 
+try:
+    import gguf
+except ImportError:
+    gguf = PlaceholderModule("gguf")
+
 try:
     from fastsafetensors import SafeTensorsFileLoader, SingleGroup
 except ImportError: