[DOC] Add additional comments for LLMEngine and AsyncLLMEngine (#1011)
commit 6549aef245 (parent 50376faa7b)
docs/source/conf.py
@@ -9,11 +9,15 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
+
+import os
+import sys
+from sphinx.ext import autodoc
+import logging
+
+sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
+
+logger = logging.getLogger(__name__)
 
 # -- Project information -----------------------------------------------------
 
@@ -21,7 +25,6 @@ project = 'vLLM'
 copyright = '2023, vLLM Team'
 author = 'the vLLM Team'
 
-
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
@@ -32,6 +35,8 @@ extensions = [
     "sphinx.ext.viewcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -55,7 +60,6 @@ html_title = project
 html_theme = 'sphinx_book_theme'
 html_logo = 'assets/logos/vllm-logo-text-light.png'
 html_theme_options = {
-    'logo_only': True,
     'path_to_docs': 'docs/source',
     'repository_url': 'https://github.com/vllm-project/vllm',
     'use_repository_button': True,
@@ -64,4 +68,29 @@ html_theme_options = {
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+# html_static_path = ['_static']
+
+# Mock out external dependencies here.
+autodoc_mock_imports = [
+    "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
+    "vllm.cuda_utils", "vllm._C"
+]
+
+for mock_target in autodoc_mock_imports:
+    if mock_target in sys.modules:
+        logger.info(
+            f"Potentially problematic mock target ({mock_target}) found; "
+            "autodoc_mock_imports cannot mock modules that have already "
+            "been loaded into sys.modules when the sphinx build starts.")
+
+
+class MockedClassDocumenter(autodoc.ClassDocumenter):
+    """Remove note about base class when a class is derived from object."""

+    def add_line(self, line: str, source: str, *lineno: int) -> None:
+        if line == "   Bases: :py:class:`object`":
+            return
+        super().add_line(line, source, *lineno)
+
+
+autodoc.ClassDocumenter = MockedClassDocumenter
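Note: `autodoc_mock_imports` is what lets Sphinx import the vllm modules on a machine without the heavy runtime dependencies. As a rough, hypothetical illustration of the same idea in plain Python (an analogy only, not what Sphinx literally does internally):

    # Rough sketch of the idea behind autodoc_mock_imports: pre-register stub
    # modules so importing vLLM code for documentation does not require the
    # real dependencies to be installed.
    import sys
    from unittest import mock

    for name in ["torch", "transformers", "psutil", "aioprometheus",
                 "sentencepiece", "vllm.cuda_utils", "vllm._C"]:
        if name not in sys.modules:  # mirrors the "already loaded" warning above
            sys.modules[name] = mock.MagicMock()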
docs/source/dev/engine/async_llm_engine.rst (new file, 7 lines)
@@ -0,0 +1,7 @@
+
+AsyncLLMEngine
+=================================
+
+.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
+    :members: generate, abort
+    :show-inheritance:
docs/source/dev/engine/engine_index.rst (new file, 13 lines)
@@ -0,0 +1,13 @@
+vLLM Engine
+=================================
+
+.. automodule:: vllm.engine
+.. currentmodule:: vllm.engine
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Engines
+
+    llm_engine
+    async_llm_engine
+
docs/source/dev/engine/llm_engine.rst (new file, 6 lines)
@@ -0,0 +1,6 @@
+LLMEngine
+=================================
+
+.. autoclass:: vllm.engine.llm_engine.LLMEngine
+    :members: add_request, abort_request, step, _init_cache
+    :show-inheritance:
docs/source/index.rst
@@ -86,3 +86,15 @@ Documentation
    :caption: Quantization
 
    quantization/auto_awq
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Developer Documentation
+
+   dev/engine/engine_index
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
vllm/core/scheduler.py
@@ -88,6 +88,18 @@ class Scheduler:
         self.waiting.append(seq_group)
 
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Aborts a sequence group with the given ID.
+
+        Check if the sequence group with the given ID
+        is present in any of the state queues.
+        If present, remove the sequence group from the state queue.
+        Also, if any of the sequences in the sequence group is not finished,
+        free the sequence with status `FINISHED_ABORTED`.
+        Otherwise, do nothing.
+
+        Args:
+            request_id: The ID(s) of the sequence group to abort.
+        """
         if isinstance(request_id, str):
             request_id = (request_id, )
         request_ids = set(request_id)
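The `isinstance` check in the context lines above exists because a bare string is itself an iterable of characters; wrapping it in a one-element tuple first keeps the two call styles equivalent. A tiny self-contained sketch of that normalization (the helper name is illustrative, not part of vLLM):

    from typing import Iterable, Set, Union

    def normalize_request_ids(request_id: Union[str, Iterable[str]]) -> Set[str]:
        # Same normalization the scheduler performs: wrap a lone string in a
        # tuple so it is not iterated character by character.
        if isinstance(request_id, str):
            request_id = (request_id, )
        return set(request_id)

    assert normalize_request_ids("req-0") == {"req-0"}
    assert normalize_request_ids(["req-1", "req-2"]) == {"req-1", "req-2"}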
vllm/engine/async_llm_engine.py
@@ -253,7 +253,8 @@ class AsyncLLMEngine:
         log_requests: Whether to log the requests.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
-        *args, *kwargs: Arguments for LLMEngine.
+        *args: Arguments for LLMEngine.
+        *kwargs: Arguments for LLMEngine.
     """
 
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -428,6 +429,49 @@ class AsyncLLMEngine:
         Yields:
             The output `RequestOutput` objects from the LLMEngine for the
             request.
+
+        Details:
+            - If the engine is not running, start the background loop,
+              which iteratively invokes
+              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+              to process the waiting requests.
+            - Add the request to the engine's `RequestTracker`.
+              On the next background loop, this request will be sent to
+              the underlying engine.
+              Also, a corresponding `AsyncStream` will be created.
+            - Wait for the request outputs from `AsyncStream` and yield them.
+
+        Example:
+            >>> # Please refer to entrypoints/api_server.py for
+            >>> # the complete example.
+            >>>
+            >>> # initialize the engine and the example input
+            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
+            >>> example_input = {
+            >>>     "prompt": "What is LLM?",
+            >>>     "stream": False,  # assume the non-streaming case
+            >>>     "temperature": 0.0,
+            >>>     "request_id": 0,
+            >>> }
+            >>>
+            >>> # start the generation
+            >>> results_generator = engine.generate(
+            >>>     example_input["prompt"],
+            >>>     SamplingParams(temperature=example_input["temperature"]),
+            >>>     example_input["request_id"])
+            >>>
+            >>> # get the results
+            >>> final_output = None
+            >>> async for request_output in results_generator:
+            >>>     if await request.is_disconnected():
+            >>>         # Abort the request if the client disconnects.
+            >>>         await engine.abort(request_id)
+            >>>         # Return or raise an error
+            >>>         ...
+            >>>     final_output = request_output
+            >>>
+            >>> # Process and return the final output
+            >>> ...
         """
         # Preprocess the request.
         # This should not be used for logging, as it is monotonic time.
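The doctest above is intentionally fragmentary (the `request` object and the surrounding web handler live in entrypoints/api_server.py). A self-contained sketch of the same streaming pattern, assuming vLLM is installed and using an illustrative model name:

    import asyncio

    from vllm import SamplingParams
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine

    async def main() -> None:
        # Engine arguments and model name are illustrative.
        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="facebook/opt-125m"))

        # generate() is an async generator: it yields a partial RequestOutput
        # every time the background engine loop produces new tokens.
        results_generator = engine.generate(
            "What is LLM?", SamplingParams(temperature=0.0), request_id="0")

        final_output = None
        async for request_output in results_generator:
            final_output = request_output

        print(final_output.outputs[0].text)

    asyncio.run(main())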
vllm/engine/llm_engine.py
@@ -257,7 +257,26 @@ class LLMEngine:
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
     def _init_cache(self) -> None:
-        """Profiles the memory usage and initializes the KV cache."""
+        """Profiles the memory usage and initializes the KV cache.
+
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        More details can be found in the
+        :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
+        from class :class:`~vllm.worker.Worker`.
+
+        Afterwards, as there may be multiple workers,
+        we take the minimum number of blocks across all workers
+        to ensure this can be applied to all of them.
+
+        Finally, the engine will initialize the KV cache
+        with the calculated number of blocks.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers(
             "profile_num_available_blocks",
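A compact illustration of the aggregation step the docstring describes; the per-worker numbers are made up and the variable names are not part of vLLM:

    # Each worker reports (num_gpu_blocks, num_cpu_blocks); the engine keeps
    # the minimum of each so the chosen cache size fits on every worker.
    per_worker_blocks = [(7987, 2048), (7950, 2048), (8012, 2048)]  # made up

    num_gpu_blocks = min(gpu for gpu, _ in per_worker_blocks)
    num_cpu_blocks = min(cpu for _, cpu in per_worker_blocks)
    print(num_gpu_blocks, num_cpu_blocks)  # 7950 2048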
@@ -334,6 +353,30 @@ class LLMEngine:
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
+
+        Details:
+            - Set arrival_time to the current time if it is None.
+            - Set prompt_token_ids to the encoded prompt if it is None.
+            - Create `best_of` number of :class:`~vllm.Sequence` objects.
+            - Create a :class:`~vllm.SequenceGroup` object
+              from the list of :class:`~vllm.Sequence`.
+            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
+
+        Example:
+            >>> # initialize engine
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> # set request arguments
+            >>> example_prompt = "Who is the president of the United States?"
+            >>> sampling_params = SamplingParams(temperature=0.0)
+            >>> request_id = 0
+            >>>
+            >>> # add the request to the engine
+            >>> engine.add_request(
+            >>>     str(request_id),
+            >>>     example_prompt,
+            >>>     SamplingParams(temperature=0.0))
+            >>> # continue the request processing
+            >>> ...
         """
         if arrival_time is None:
             arrival_time = time.monotonic()
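The `best_of` detail above determines how many sequences are sampled for a single request. A small, illustrative check (requires vLLM installed; the values are arbitrary):

    from vllm import SamplingParams

    # best_of controls how many Sequence objects the engine creates for the
    # request; n controls how many of them are returned to the caller.
    params = SamplingParams(n=1, best_of=4, temperature=0.8)
    print(params.n, params.best_of)  # 1 4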
@@ -358,6 +401,17 @@ class LLMEngine:
 
         Args:
             request_id: The ID(s) of the request to abort.
+
+        Details:
+            - Refer to the
+              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
+              from class :class:`~vllm.core.scheduler.Scheduler`.
+
+        Example:
+            >>> # initialize engine and add a request with request_id
+            >>> request_id = str(0)
+            >>> # abort the request
+            >>> engine.abort_request(request_id)
         """
         self.scheduler.abort_seq_group(request_id)
 
@@ -617,11 +671,53 @@ class LLMEngine:
     def step(self) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
-        This function performs one decoding iteration of the engine. It first
-        schedules the sequences to be executed in the next iteration and the
-        token blocks to be swapped in/out/copy. Then, it executes the model
-        and updates the scheduler with the model outputs. Finally, it decodes
-        the sequences and returns the newly generated results.
+        .. figure:: https://i.imgur.com/sv2HssD.png
+            :alt: Overview of the step function
+            :align: center
+
+            Overview of the step function.
+
+        Details:
+            - Step 1: Schedules the sequences to be executed in the next
+              iteration and the token blocks to be swapped in/out/copied.
+
+                - Depending on the scheduling policy,
+                  sequences may be `preempted/reordered`.
+                - A Sequence Group (SG) refers to a group of sequences
+                  that are generated from the same prompt.
+
+            - Step 2: Calls the workers to execute the model.
+            - Step 3: Processes the model output. This mainly includes:
+
+                - Decodes the relevant outputs.
+                - Updates the scheduled sequence groups with model outputs
+                  based on their `sampling parameters` (`use_beam_search` or not).
+                - Frees the finished sequence groups.
+
+            - Finally, it creates and returns the newly generated results.
+
+        Example:
+            >>> # Please see the example/ folder for more detailed examples.
+            >>>
+            >>> # initialize engine and request arguments
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> example_inputs = [(0, "What is LLM?",
+            >>>     SamplingParams(temperature=0.0))]
+            >>>
+            >>> # Start the engine with an event loop
+            >>> while True:
+            >>>     if example_inputs:
+            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
+            >>>         engine.add_request(str(req_id), prompt, sampling_params)
+            >>>
+            >>>     # continue the request processing
+            >>>     request_outputs = engine.step()
+            >>>     for request_output in request_outputs:
+            >>>         if request_output.finished:
+            >>>             # return or show the request output
+            >>>
+            >>>     if not (engine.has_unfinished_requests() or example_inputs):
+            >>>         break
         """
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
 
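The doctest above elides the body of the `if request_output.finished:` branch. A self-contained, runnable version of the same loop, assuming vLLM is installed and using an illustrative model name:

    from vllm import EngineArgs, LLMEngine, SamplingParams

    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    example_inputs = [(0, "What is LLM?", SamplingParams(temperature=0.0))]

    while True:
        if example_inputs:
            req_id, prompt, sampling_params = example_inputs.pop(0)
            engine.add_request(str(req_id), prompt, sampling_params)

        # One call to step() schedules, runs the model, and post-processes.
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)

        if not (engine.has_unfinished_requests() or example_inputs):
            break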
vllm/worker/worker.py
@@ -87,6 +87,14 @@ class Worker:
         gpu_memory_utilization: float,
         cpu_swap_space: int,
     ) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model and returns the maximum
+        number of GPU and CPU cache blocks that can be allocated.
+
+        Args:
+            block_size: The size of the cache block.
+            gpu_memory_utilization: The fraction of the total GPU memory to use.
+            cpu_swap_space: The size of the CPU swap space in bytes.
+        """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
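For intuition about what this profiling produces, here is a simplified, hypothetical version of the block-count arithmetic; the real computation in `profile_num_available_blocks` may differ in detail, and the numbers are made up:

    def estimate_num_gpu_blocks(total_gpu_memory: int,
                                peak_activation_memory: int,
                                gpu_memory_utilization: float,
                                cache_block_size: int) -> int:
        # Memory the engine is allowed to use, minus the peak memory the
        # forward pass needs, divided by the size of one KV-cache block.
        usable = total_gpu_memory * gpu_memory_utilization - peak_activation_memory
        return max(int(usable // cache_block_size), 0)

    # Example: 24 GiB GPU, 90% budget, 6 GiB peak activations, 2 MiB per block.
    GiB, MiB = 1 << 30, 1 << 20
    print(estimate_num_gpu_blocks(24 * GiB, 6 * GiB, 0.90, 2 * MiB))  # 7987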