diff --git a/docs/features/lora.md b/docs/features/lora.md
index d72c0bb4160c..6acfdcce4458 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -279,64 +279,64 @@ Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-s
 To this end, we allow registration of default multimodal LoRAs to handle this automatically, where users can map each modality to a LoRA adapter to automatically apply it when the corresponding inputs are present. Note that currently, we only allow one LoRA per prompt; if several modalities are provided, each of which are registered to a given modality, none of them will be applied.

-Example usage for offline inference:
+??? code "Example usage for offline inference"

-```python
-from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
-from vllm.assets.audio import AudioAsset
+    ```python
+    from transformers import AutoTokenizer
+    from vllm import LLM, SamplingParams
+    from vllm.assets.audio import AudioAsset

-model_id = "ibm-granite/granite-speech-3.3-2b"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model_id = "ibm-granite/granite-speech-3.3-2b"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)

-def get_prompt(question: str, has_audio: bool):
-    """Build the input prompt to send to vLLM."""
-    if has_audio:
-        question = f"<|audio|>{question}"
-    chat = [
-        {
-            "role": "user",
-            "content": question
+    def get_prompt(question: str, has_audio: bool):
+        """Build the input prompt to send to vLLM."""
+        if has_audio:
+            question = f"<|audio|>{question}"
+        chat = [
+            {
+                "role": "user",
+                "content": question
+            }
+        ]
+        return tokenizer.apply_chat_template(chat, tokenize=False)
+
+
+    model = LLM(
+        model=model_id,
+        enable_lora=True,
+        max_lora_rank=64,
+        max_model_len=2048,
+        limit_mm_per_prompt={"audio": 1},
+        # Will always pass a `LoRARequest` with the `model_id`
+        # whenever audio is contained in the request data.
+        default_mm_loras = {"audio": model_id},
+        enforce_eager=True,
+    )
+
+    question = "can you transcribe the speech into a written format?"
+    prompt_with_audio = get_prompt(
+        question=question,
+        has_audio=True,
+    )
+    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
+
+    inputs = {
+        "prompt": prompt_with_audio,
+        "multi_modal_data": {
+            "audio": audio,
         }
-    ]
-    return tokenizer.apply_chat_template(chat, tokenize=False)
-
-
-model = LLM(
-    model=model_id,
-    enable_lora=True,
-    max_lora_rank=64,
-    max_model_len=2048,
-    limit_mm_per_prompt={"audio": 1},
-    # Will always pass a `LoRARequest` with the `model_id`
-    # whenever audio is contained in the request data.
-    default_mm_loras = {"audio": model_id},
-    enforce_eager=True,
-)
-
-question = "can you transcribe the speech into a written format?"
-prompt_with_audio = get_prompt(
-    question=question,
-    has_audio=True,
-)
-audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
-
-inputs = {
-    "prompt": prompt_with_audio,
-    "multi_modal_data": {
-        "audio": audio,
     }
-}
-outputs = model.generate(
-    inputs,
-    sampling_params=SamplingParams(
-        temperature=0.2,
-        max_tokens=64,
-    ),
-)
-```
+    outputs = model.generate(
+        inputs,
+        sampling_params=SamplingParams(
+            temperature=0.2,
+            max_tokens=64,
+        ),
+    )
+    ```

 You can also pass a json dictionary of `--default-mm-loras` mapping modalities to LoRA model IDs. For example, when starting the server:
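
The server example referenced by the last context line of the hunk is outside the diff. As a point of reference only, a minimal sketch of what such an invocation could look like, assuming flag values that mirror the offline example above (the exact command in the docs may differ):

```bash
# Sketch only: maps the audio modality to the same LoRA used in the offline example.
vllm serve ibm-granite/granite-speech-3.3-2b \
    --enable-lora \
    --max-lora-rank 64 \
    --max-model-len 2048 \
    --default-mm-loras '{"audio": "ibm-granite/granite-speech-3.3-2b"}'
```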