[doc] fold long code block (#20795)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Reid, 2025-07-11 14:16:41 +08:00, committed by GitHub
parent 5d09152ff1
commit 6a9e6b2abf


@@ -279,17 +279,17 @@ Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-s
To this end, we allow registration of default multimodal LoRAs to handle this automatically, where users can map each modality to a LoRA adapter so that it is applied automatically whenever the corresponding inputs are present. Note that currently we only allow one LoRA per prompt; if several modalities are provided, each of which is registered to its own LoRA, none of them will be applied.

??? code "Example usage for offline inference"

    ```python
    from transformers import AutoTokenizer
    from vllm import LLM, SamplingParams
    from vllm.assets.audio import AudioAsset

    model_id = "ibm-granite/granite-speech-3.3-2b"
    tokenizer = AutoTokenizer.from_pretrained(model_id)


    def get_prompt(question: str, has_audio: bool):
        """Build the input prompt to send to vLLM."""
        if has_audio:
            question = f"<|audio|>{question}"
@@ -302,7 +302,7 @@ def get_prompt(question: str, has_audio: bool):
        return tokenizer.apply_chat_template(chat, tokenize=False)


    model = LLM(
        model=model_id,
        enable_lora=True,
        max_lora_rank=64,
@@ -312,31 +312,31 @@ model = LLM(
        # whenever audio is contained in the request data.
        default_mm_loras = {"audio": model_id},
        enforce_eager=True,
    )

    question = "can you transcribe the speech into a written format?"
    prompt_with_audio = get_prompt(
        question=question,
        has_audio=True,
    )

    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    inputs = {
        "prompt": prompt_with_audio,
        "multi_modal_data": {
            "audio": audio,
        }
    }

    outputs = model.generate(
        inputs,
        sampling_params=SamplingParams(
            temperature=0.2,
            max_tokens=64,
        ),
    )
    ```
You can also pass a JSON dictionary of `--default-mm-loras` mapping modalities to LoRA model IDs. For example, when starting the server:
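A minimal sketch of what that invocation might look like, assuming the Granite Speech checkpoint from the offline example is reused as both the base model and the audio LoRA; the exact serve flags should be verified against your vLLM version:

```bash
# Hypothetical serve command mirroring the offline example above:
# the checkpoint itself is registered as the default LoRA for audio inputs.
vllm serve ibm-granite/granite-speech-3.3-2b \
    --enable-lora \
    --max-lora-rank 64 \
    --default-mm-loras '{"audio":"ibm-granite/granite-speech-3.3-2b"}'
```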