[misc][core] lazy import outlines (#7831)
parent d81abefd2e
commit 7d9ffa2ae1
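The change itself is small: module-level imports of vllm.model_executor.guided_decoding.outlines_decoding move into the function bodies that actually need them, so a plain `import vllm` no longer drags in outlines (see the issue #4193 referenced in the NOTE comments below). A minimal, self-contained sketch of the pattern, using the stdlib `decimal` module as a stand-in for the heavy optional dependency:

import sys


def to_number(text: str, exact: bool = False):
    """Parse a number, loading the heavy backend only when asked to."""
    if exact:
        # Lazy import: the module is loaded on first use of this branch,
        # not when this file is imported.
        import decimal  # stand-in for a heavy optional dependency
        return decimal.Decimal(text)
    return float(text)


# In a fresh interpreter nothing has loaded the backend yet ...
assert "decimal" not in sys.modules
to_number("1.5", exact=True)
# ... and it appears only after the guarded branch actually ran.
assert "decimal" in sys.modules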
@@ -87,7 +87,8 @@ steps:
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/openai
 
 - label: Distributed Tests (4 GPUs) # 10min
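The pipeline split above is what the "it needs a clean process" comment is about: sys.modules is global to the interpreter, so if any earlier test in the same pytest process had imported outlines, the new test's assertion would fail (or pass) for the wrong reason. A hedged sketch of the same isolation trick, assuming vllm is installed:

import subprocess
import sys

# Run the import check in a fresh interpreter so that imports leaked by
# earlier tests in this process cannot poison sys.modules.
check = "import sys, vllm; assert 'outlines' not in sys.modules"
subprocess.run([sys.executable, "-c", check], check=True)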
tests/entrypoints/llm/test_lazy_outlines.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+import sys
+
+from vllm import LLM, SamplingParams
+
+
+def test_lazy_outlines(sample_regex):
+    """If users don't use guided decoding, outlines should not be imported.
+    """
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(model="facebook/opt-125m",
+              enforce_eager=True,
+              gpu_memory_utilization=0.3)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # make sure outlines is not imported
+    assert 'outlines' not in sys.modules
+
+    llm = LLM(model="facebook/opt-125m",
+              enforce_eager=True,
+              guided_decoding_backend="lm-format-enforcer",
+              gpu_memory_utilization=0.3)
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    outputs = llm.generate(
+        prompts=[
+            f"Give an example IPv4 address with this regex: {sample_regex}"
+        ] * 2,
+        sampling_params=sampling_params,
+        use_tqdm=True,
+        guided_options_request=dict(guided_regex=sample_regex))
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # make sure outlines is not imported
+    assert 'outlines' not in sys.modules
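Note that sample_regex is a pytest fixture the test takes as an argument; it is defined in a shared conftest rather than in this diff. A hypothetical stand-in, for readers running the file outside the vllm test tree (the real fixture may differ):

import pytest


@pytest.fixture
def sample_regex():
    # Hypothetical IPv4-shaped pattern; the real fixture lives in the
    # repo's conftest.py.
    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")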
@@ -5,9 +5,6 @@ from vllm.entrypoints.openai.protocol import (
                                                 CompletionRequest)
 from vllm.model_executor.guided_decoding.guided_fields import (
     GuidedDecodingRequest)
-from vllm.model_executor.guided_decoding.outlines_decoding import (
-    get_local_outlines_guided_decoding_logits_processor,
-    get_outlines_guided_decoding_logits_processor)
 from vllm.sampling_params import LogitsProcessor
 
 
@@ -18,6 +15,9 @@ async def get_guided_decoding_logits_processor(
     request = _adapt_request_for_tool_use(request)
 
     if guided_decoding_backend == 'outlines':
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
+            get_outlines_guided_decoding_logits_processor)
         return await get_outlines_guided_decoding_logits_processor(
             request, tokenizer)
     if guided_decoding_backend == 'lm-format-enforcer':
@@ -37,6 +37,9 @@ def get_local_guided_decoding_logits_processor(
     # request = _adapt_request_for_tool_use(request)
 
     if guided_decoding_backend == 'outlines':
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
+            get_local_outlines_guided_decoding_logits_processor)
         return get_local_outlines_guided_decoding_logits_processor(
             guided_options, tokenizer)
     if guided_decoding_backend == 'lm-format-enforcer':
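The behavior of these two dispatch functions is unchanged; only the import time moves, from `import vllm` to the first guided-decoding call. One quick way to verify the effect locally (assumes vllm is installed; the exact module set varies by version):

import sys

before = set(sys.modules)
import vllm  # noqa: F401

leaked = sorted(name for name in set(sys.modules) - before
                if name.startswith("outlines"))
print(leaked or "outlines was not imported")  # expected after this commit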
@@ -14,9 +14,6 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                                CompletionRequest)
 from vllm.model_executor.guided_decoding.guided_fields import (
     GuidedDecodingRequest)
-from vllm.model_executor.guided_decoding.outlines_decoding import (
-    get_local_outlines_guided_decoding_logits_processor,
-    get_outlines_guided_decoding_logits_processor)
 from vllm.sampling_params import LogitsProcessor
 
 
@@ -43,6 +40,10 @@ async def get_lm_format_enforcer_guided_decoding_logits_processor(
         character_level_parser = RegexParser(request.guided_regex)
     elif request.guided_grammar:
         # CFG grammar not supported by LMFE, revert to outlines
+
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (
+            get_outlines_guided_decoding_logits_processor)
         return await get_outlines_guided_decoding_logits_processor(
             request, tokenizer)
     elif (request.response_format is not None
@@ -87,6 +88,10 @@ def get_local_lm_format_enforcer_guided_decoding_logits_processor(
         character_level_parser = RegexParser(guided_options.guided_regex)
     elif guided_options.guided_grammar:
         # CFG grammar not supported by LMFE, revert to outlines
+
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (
+            get_local_outlines_guided_decoding_logits_processor)
         return get_local_outlines_guided_decoding_logits_processor(
             guided_options, tokenizer)
     elif guided_options.guided_json_object: