[doc] fold long code block (#20795)

Signed-off-by: reidliu41 <reid201711@gmail.com>

parent 5d09152ff1
commit 6a9e6b2abf
@@ -279,64 +279,64 @@ Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-s
To this end, we allow registration of default multimodal LoRAs to handle this automatically: users can map each modality to a LoRA adapter, and it is applied automatically whenever the corresponding inputs are present. Note that currently we only allow one LoRA per prompt; if inputs for several modalities are provided and each of them is registered to a LoRA adapter, none of the adapters will be applied.
??? code "Example usage for offline inference"

    ```python
    from transformers import AutoTokenizer

    from vllm import LLM, SamplingParams
    from vllm.assets.audio import AudioAsset

    model_id = "ibm-granite/granite-speech-3.3-2b"
    tokenizer = AutoTokenizer.from_pretrained(model_id)


    def get_prompt(question: str, has_audio: bool):
        """Build the input prompt to send to vLLM."""
        if has_audio:
            question = f"<|audio|>{question}"
        chat = [
            {
                "role": "user",
                "content": question
            }
        ]
        return tokenizer.apply_chat_template(chat, tokenize=False)


    model = LLM(
        model=model_id,
        enable_lora=True,
        max_lora_rank=64,
        max_model_len=2048,
        limit_mm_per_prompt={"audio": 1},
        # Will always pass a `LoRARequest` with the `model_id`
        # whenever audio is contained in the request data.
        default_mm_loras={"audio": model_id},
        enforce_eager=True,
    )

    question = "can you transcribe the speech into a written format?"
    prompt_with_audio = get_prompt(
        question=question,
        has_audio=True,
    )
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    inputs = {
        "prompt": prompt_with_audio,
        "multi_modal_data": {
            "audio": audio,
        }
    }

    outputs = model.generate(
        inputs,
        sampling_params=SamplingParams(
            temperature=0.2,
            max_tokens=64,
        ),
    )
    ```
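The generated transcription can then be read back from the returned `RequestOutput` objects, e.g. `outputs[0].outputs[0].text`.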
You can also pass a JSON dictionary via `--default-mm-loras`, mapping modalities to LoRA model IDs. For example, when starting the server:
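The following is an illustrative sketch rather than the upstream example; it assumes the server flags mirror the offline arguments above, and the exact invocation may differ for your deployment:

```bash
# Hedged sketch: register the Granite Speech LoRA as the default adapter for
# audio inputs when serving the model (flag values mirror the offline example).
vllm serve ibm-granite/granite-speech-3.3-2b \
    --enable-lora \
    --max-lora-rank 64 \
    --max-model-len 2048 \
    --default-mm-loras '{"audio": "ibm-granite/granite-speech-3.3-2b"}'
```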