Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Misc] refactor prompt embedding examples (#18405)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>

parent be48360c1f
commit 8f55962a7f
@@ -20,59 +20,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPrompt`:

You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:

-```python
+<gh-file:examples/offline_inference/prompt_embed_inference.py>
-from vllm import LLM
-import transformers
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-llm = LLM(model=model_name, enable_prompt_embeds=True)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Single prompt inference
-outputs = llm.generate({
-    "prompt_embeds": prompt_embeds,
-})
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-
-# Batch inference
-
-chats = [
-    [{"role": "user", "content": "Please tell me about the capital of France."}],
-    [{"role": "user", "content": "When is the day longest during the year?"}],
-    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
-]
-
-token_ids_list = [
-    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
-]
-prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]
-
-outputs = llm.generate(
-    [
-        {
-            "prompt_embeds": prompt_embeds,
-        } for prompt_embeds in prompt_embeds_list
-    ]
-)
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```

## Online Serving
@@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \

Then, you can use the OpenAI client as follows:

-```python
+<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
-from openai import OpenAI
-import transformers
-import torch
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Prompt embeddings
-buffer = io.BytesIO()
-torch.save(prompt_embeds, buffer)
-buffer.seek(0)
-binary_data = buffer.read()
-encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
-
-completion = client_with_prompt_embeds.completions.create(
-    model=model_name,
-    # NOTE: The OpenAI client does not allow `None` as an input to
-    # `prompt`. Use an empty string if you have no text prompts.
-    prompt="",
-    max_tokens=5,
-    temperature=0.0,
-    # NOTE: The OpenAI client allows passing in extra JSON body via the
-    # `extra_body` argument.
-    extra_body={"prompt_embeds": encoded_embeds}
-)
-
-print(completion.choices[0].text)
-```
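For context on the request format: the `extra_body` argument in the example above is simply merged into the request JSON by the OpenAI client, so an equivalent call can be issued without the SDK. Below is a minimal sketch using plain `requests`; it is not part of this commit, and the `/v1/completions` path plus the `prompt_embeds` field are taken from the documented example (assume a server started with `--enable-prompt-embeds`).

```python
import base64
import io

import requests
import torch
import transformers

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

# Build prompt embeddings exactly as in the documented example.
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True,
                                          return_tensors='pt')
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

# Serialize with torch.save and base64-encode, matching the client example.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode('utf-8')

# The OpenAI client would send an equivalent JSON body; `prompt_embeds` is the
# extra field accepted when the server runs with --enable-prompt-embeds.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": model_name,
        "prompt": "",  # the endpoint still requires a (possibly empty) prompt
        "max_tokens": 5,
        "temperature": 0.0,
        "prompt_embeds": encoded_embeds,
    },
)
print(response.json()["choices"][0]["text"])
```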
103 examples/offline_inference/prompt_embed_inference.py Normal file

@@ -0,0 +1,103 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
      You must request access to use it:
      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
    python examples/offline_inference/prompt_embed_inference.py
"""

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizer)

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
    llm = LLM(model=model_name, enable_prompt_embeds=True)
    return tokenizer, embedding_layer, llm


def get_prompt_embeds(chat: list[dict[str,
                                      str]], tokenizer: PreTrainedTokenizer,
                      embedding_layer: torch.nn.Module):
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors='pt')
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds


def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
                            embedding_layer: torch.nn.Module):
    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    outputs = llm.generate({
        "prompt_embeds": prompt_embeds,
    })

    print("\n[Single Inference Output]")
    print("-" * 30)
    for o in outputs:
        print(o.outputs[0].text)
    print("-" * 30)


def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
                           embedding_layer: torch.nn.Module):
    chats = [[{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }],
             [{
                 "role": "user",
                 "content": "When is the day longest during the year?"
             }],
             [{
                 "role": "user",
                 "content": "Where is bigger, the moon or the sun?"
             }]]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]

    outputs = llm.generate([{
        "prompt_embeds": embeds
    } for embeds in prompt_embeds_list])

    print("\n[Batch Inference Outputs]")
    print("-" * 30)
    for i, o in enumerate(outputs):
        print(f"Q{i+1}: {chats[i][0]['content']}")
        print(f"A{i+1}: {o.outputs[0].text}\n")
    print("-" * 30)


def main():
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
    single_prompt_inference(llm, tokenizer, embedding_layer)
    batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
    main()
```
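A side note on the offline example above: prompt embeddings can be combined with explicit sampling settings. The sketch below assumes the standard `vllm.SamplingParams` API and illustrative sampling values; it is not part of the committed file.

```python
import transformers

from vllm import LLM, SamplingParams

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True,
                                          return_tensors='pt')
# Same embedding construction as get_prompt_embeds() in the example file.
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

llm = LLM(model=model_name, enable_prompt_embeds=True)
# Illustrative sampling values; choose whatever fits your use case.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

outputs = llm.generate({"prompt_embeds": prompt_embeds},
                       sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```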
86 examples/online_serving/prompt_embed_inference_with_openai_client.py Normal file

@@ -0,0 +1,86 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings

This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API

Run the vLLM server first:
    vllm serve meta-llama/Llama-3.2-1B-Instruct \
        --task generate \
        --max-model-len 4096 \
        --enable-prompt-embeds

Run the client:
    python examples/online_serving/prompt_embed_inference_with_openai_client.py

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
      You must request access to use it:
      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Dependencies:
- transformers
- torch
- openai
"""
import base64
import io

import torch
import transformers
from openai import OpenAI


def main():
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
    )

    model_name = "meta-llama/Llama-3.2-1B-Instruct"

    # Transformers
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name)

    # Refer to the HuggingFace repo for the correct format to use
    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors='pt')

    embedding_layer = transformers_model.get_input_embeddings()
    prompt_embeds = embedding_layer(token_ids).squeeze(0)

    # Prompt embeddings
    buffer = io.BytesIO()
    torch.save(prompt_embeds, buffer)
    buffer.seek(0)
    binary_data = buffer.read()
    encoded_embeds = base64.b64encode(binary_data).decode('utf-8')

    completion = client.completions.create(
        model=model_name,
        # NOTE: The OpenAI client does not allow `None` as an input to
        # `prompt`. Use an empty string if you have no text prompts.
        prompt="",
        max_tokens=5,
        temperature=0.0,
        # NOTE: The OpenAI client allows passing in extra JSON body via the
        # `extra_body` argument.
        extra_body={"prompt_embeds": encoded_embeds})

    print("-" * 30)
    print(completion.choices[0].text)
    print("-" * 30)


if __name__ == "__main__":
    main()
```
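Since the client above sends a base64-encoded `torch.save` buffer, the payload can be decoded locally before sending as a quick sanity check. The following is a minimal sketch assuming the server expects exactly that serialization (which is what the example constructs); it is not part of the committed file, and `decode_prompt_embeds` is a hypothetical helper.

```python
import base64
import io

import torch


def decode_prompt_embeds(encoded_embeds: str) -> torch.Tensor:
    """Reverse the base64 + torch.save encoding used in main() above."""
    raw = base64.b64decode(encoded_embeds)
    return torch.load(io.BytesIO(raw))


# Usage: with `encoded_embeds` built as in main(), the decoded tensor should
# have shape (num_prompt_tokens, hidden_size).
# decoded = decode_prompt_embeds(encoded_embeds)
# print(decoded.shape)
```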