[doc] fold long code block (#20795)

Signed-off-by: reidliu41 <reid201711@gmail.com>

parent 5d09152ff1
commit 6a9e6b2abf
@@ -279,64 +279,64 @@ Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-s
To this end, we allow registration of default multimodal LoRAs to handle this automatically: users can map each modality to a LoRA adapter, and it is applied automatically whenever the corresponding inputs are present. Note that currently we only allow one LoRA per prompt; if inputs for several modalities are provided and each of them is registered to a LoRA adapter, none of the adapters will be applied.
??? code "Example usage for offline inference"

    ```python
    from transformers import AutoTokenizer

    from vllm import LLM, SamplingParams
    from vllm.assets.audio import AudioAsset

    model_id = "ibm-granite/granite-speech-3.3-2b"
    tokenizer = AutoTokenizer.from_pretrained(model_id)


    def get_prompt(question: str, has_audio: bool):
        """Build the input prompt to send to vLLM."""
        if has_audio:
            question = f"<|audio|>{question}"
        chat = [
            {
                "role": "user",
                "content": question
            }
        ]
        return tokenizer.apply_chat_template(chat, tokenize=False)


    model = LLM(
        model=model_id,
        enable_lora=True,
        max_lora_rank=64,
        max_model_len=2048,
        limit_mm_per_prompt={"audio": 1},
        # Will always pass a `LoRARequest` with the `model_id`
        # whenever audio is contained in the request data.
        default_mm_loras={"audio": model_id},
        enforce_eager=True,
    )

    question = "can you transcribe the speech into a written format?"
    prompt_with_audio = get_prompt(
        question=question,
        has_audio=True,
    )
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    inputs = {
        "prompt": prompt_with_audio,
        "multi_modal_data": {
            "audio": audio,
        }
    }

    outputs = model.generate(
        inputs,
        sampling_params=SamplingParams(
            temperature=0.2,
            max_tokens=64,
        ),
    )
    ```
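The generated transcription can then be read back from the returned `RequestOutput` objects, e.g. `outputs[0].outputs[0].text`.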
You can also pass a JSON dictionary via `--default-mm-loras`, mapping modalities to LoRA model IDs. For example, when starting the server:
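The following is an illustrative sketch rather than the upstream example; it assumes the server flags mirror the offline arguments above, and the exact invocation may differ for your deployment:

```bash
# Hedged sketch: register the Granite Speech LoRA as the default adapter for
# audio inputs when serving the model (flag values mirror the offline example).
vllm serve ibm-granite/granite-speech-3.3-2b \
    --enable-lora \
    --max-lora-rank 64 \
    --max-model-len 2048 \
    --default-mm-loras '{"audio": "ibm-granite/granite-speech-3.3-2b"}'
```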