Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Misc] refactor prompt embedding examples (#18405)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>

parent be48360c1f
commit 8f55962a7f
@@ -20,59 +20,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPrompt`:

You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:

-```python
+<gh-file:examples/offline_inference/prompt_embed_inference.py>
-from vllm import LLM
-import transformers
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-llm = LLM(model=model_name, enable_prompt_embeds=True)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Single prompt inference
-outputs = llm.generate({
-    "prompt_embeds": prompt_embeds,
-})
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-
-# Batch inference
-
-chats = [
-    [{"role": "user", "content": "Please tell me about the capital of France."}],
-    [{"role": "user", "content": "When is the day longest during the year?"}],
-    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
-]
-
-token_ids_list = [
-    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
-]
-prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]
-
-outputs = llm.generate(
-    [
-        {
-            "prompt_embeds": prompt_embeds,
-        } for prompt_embeds in prompt_embeds_list
-    ]
-)
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```

## Online Serving
@@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \

Then, you can use the OpenAI client as follows:

-```python
+<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
-from openai import OpenAI
-import transformers
-import torch
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Prompt embeddings
-buffer = io.BytesIO()
-torch.save(prompt_embeds, buffer)
-buffer.seek(0)
-binary_data = buffer.read()
-encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
-
-completion = client_with_prompt_embeds.completions.create(
-    model=model_name,
-    # NOTE: The OpenAI client does not allow `None` as an input to
-    # `prompt`. Use an empty string if you have no text prompts.
-    prompt="",
-    max_tokens=5,
-    temperature=0.0,
-    # NOTE: The OpenAI client allows passing in extra JSON body via the
-    # `extra_body` argument.
-    extra_body={"prompt_embeds": encoded_embeds}
-)
-
-print(completion.choices[0].text)
-```
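For context on the request format: the `extra_body` argument in the example above is simply merged into the request JSON by the OpenAI client, so an equivalent call can be issued without the SDK. Below is a minimal sketch using plain `requests`; it is not part of this commit, and the `/v1/completions` path plus the `prompt_embeds` field are taken from the documented example (assume a server started with `--enable-prompt-embeds`).

```python
import base64
import io

import requests
import torch
import transformers

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

# Build prompt embeddings exactly as in the documented example.
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True,
                                          return_tensors='pt')
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

# Serialize with torch.save and base64-encode, matching the client example.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode('utf-8')

# The OpenAI client would send an equivalent JSON body; `prompt_embeds` is the
# extra field accepted when the server runs with --enable-prompt-embeds.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": model_name,
        "prompt": "",  # the endpoint still requires a (possibly empty) prompt
        "max_tokens": 5,
        "temperature": 0.0,
        "prompt_embeds": encoded_embeds,
    },
)
print(response.json()["choices"][0]["text"])
```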
103 examples/offline_inference/prompt_embed_inference.py Normal file

@@ -0,0 +1,103 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
      You must request access to use it:
      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
    python examples/offline_inference/prompt_embed_inference.py
"""

import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizer)

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
    embedding_layer = transformers_model.get_input_embeddings()
    llm = LLM(model=model_name, enable_prompt_embeds=True)
    return tokenizer, embedding_layer, llm


def get_prompt_embeds(chat: list[dict[str,
                                      str]], tokenizer: PreTrainedTokenizer,
                      embedding_layer: torch.nn.Module):
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors='pt')
    prompt_embeds = embedding_layer(token_ids).squeeze(0)
    return prompt_embeds


def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
                            embedding_layer: torch.nn.Module):
    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

    outputs = llm.generate({
        "prompt_embeds": prompt_embeds,
    })

    print("\n[Single Inference Output]")
    print("-" * 30)
    for o in outputs:
        print(o.outputs[0].text)
    print("-" * 30)


def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer,
                           embedding_layer: torch.nn.Module):
    chats = [[{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }],
             [{
                 "role": "user",
                 "content": "When is the day longest during the year?"
             }],
             [{
                 "role": "user",
                 "content": "Where is bigger, the moon or the sun?"
             }]]

    prompt_embeds_list = [
        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
    ]

    outputs = llm.generate([{
        "prompt_embeds": embeds
    } for embeds in prompt_embeds_list])

    print("\n[Batch Inference Outputs]")
    print("-" * 30)
    for i, o in enumerate(outputs):
        print(f"Q{i+1}: {chats[i][0]['content']}")
        print(f"A{i+1}: {o.outputs[0].text}\n")
    print("-" * 30)


def main():
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
    single_prompt_inference(llm, tokenizer, embedding_layer)
    batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
    main()
```
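A side note on the offline example above: prompt embeddings can be combined with explicit sampling settings. The sketch below assumes the standard `vllm.SamplingParams` API and illustrative sampling values; it is not part of the committed file.

```python
import transformers

from vllm import LLM, SamplingParams

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
hf_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True,
                                          return_tensors='pt')
# Same embedding construction as get_prompt_embeds() in the example file.
prompt_embeds = hf_model.get_input_embeddings()(token_ids).squeeze(0)

llm = LLM(model=model_name, enable_prompt_embeds=True)
# Illustrative sampling values; choose whatever fits your use case.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

outputs = llm.generate({"prompt_embeds": prompt_embeds},
                       sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```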
86 examples/online_serving/prompt_embed_inference_with_openai_client.py Normal file

@@ -0,0 +1,86 @@
```python
# SPDX-License-Identifier: Apache-2.0
"""
vLLM OpenAI-Compatible Client with Prompt Embeddings

This script demonstrates how to:
1. Generate prompt embeddings using Hugging Face Transformers
2. Encode them in base64 format
3. Send them to a vLLM server via the OpenAI-compatible Completions API

Run the vLLM server first:
    vllm serve meta-llama/Llama-3.2-1B-Instruct \
        --task generate \
        --max-model-len 4096 \
        --enable-prompt-embeds

Run the client:
    python examples/online_serving/prompt_embed_inference_with_openai_client.py

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
      You must request access to use it:
      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Dependencies:
- transformers
- torch
- openai
"""
import base64
import io

import torch
import transformers
from openai import OpenAI


def main():
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
    )

    model_name = "meta-llama/Llama-3.2-1B-Instruct"

    # Transformers
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name)

    # Refer to the HuggingFace repo for the correct format to use
    chat = [{
        "role": "user",
        "content": "Please tell me about the capital of France."
    }]
    token_ids = tokenizer.apply_chat_template(chat,
                                              add_generation_prompt=True,
                                              return_tensors='pt')

    embedding_layer = transformers_model.get_input_embeddings()
    prompt_embeds = embedding_layer(token_ids).squeeze(0)

    # Prompt embeddings
    buffer = io.BytesIO()
    torch.save(prompt_embeds, buffer)
    buffer.seek(0)
    binary_data = buffer.read()
    encoded_embeds = base64.b64encode(binary_data).decode('utf-8')

    completion = client.completions.create(
        model=model_name,
        # NOTE: The OpenAI client does not allow `None` as an input to
        # `prompt`. Use an empty string if you have no text prompts.
        prompt="",
        max_tokens=5,
        temperature=0.0,
        # NOTE: The OpenAI client allows passing in extra JSON body via the
        # `extra_body` argument.
        extra_body={"prompt_embeds": encoded_embeds})

    print("-" * 30)
    print(completion.choices[0].text)
    print("-" * 30)


if __name__ == "__main__":
    main()
```
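Since the client above sends a base64-encoded `torch.save` buffer, the payload can be decoded locally before sending as a quick sanity check. The following is a minimal sketch assuming the server expects exactly that serialization (which is what the example constructs); it is not part of the committed file, and `decode_prompt_embeds` is a hypothetical helper.

```python
import base64
import io

import torch


def decode_prompt_embeds(encoded_embeds: str) -> torch.Tensor:
    """Reverse the base64 + torch.save encoding used in main() above."""
    raw = base64.b64decode(encoded_embeds)
    return torch.load(io.BytesIO(raw))


# Usage: with `encoded_embeds` built as in main(), the decoded tensor should
# have shape (num_prompt_tokens, hidden_size).
# decoded = decode_prompt_embeds(encoded_embeds)
# print(decoded.shape)
```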