[Docs] add the parallel sampling usage in LLMEngine and AsyncLLM (#24222)

This commit is contained in:
William Song 2025-09-18 20:37:08 +09:00 committed by GitHub
parent eaffe4486c
commit c9ff9e6f0c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -81,7 +81,13 @@ class SamplingParams(
"""
n: int = 1
"""Number of output sequences to return for the given prompt."""
"""Number of outputs to return for the given prompt request.
NOTE:
`AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs
are generated and streamed cumulatively per request. To see all `n`
outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
in `SamplingParams`."""
best_of: Optional[int] = None
"""Number of output sequences that are generated from the prompt. From
these `best_of` sequences, the top `n` sequences are returned. `best_of`