diff --git a/docs/source/conf.py b/docs/source/conf.py
index d0c64cf53230..44c976468ab0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -9,11 +9,15 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+import os
+import sys
+from sphinx.ext import autodoc
+import logging
+
+sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
+
+logger = logging.getLogger(__name__)
 
 
 # -- Project information -----------------------------------------------------
 
@@ -21,7 +25,6 @@
 project = 'vLLM'
 copyright = '2023, vLLM Team'
 author = 'the vLLM Team'
-
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
@@ -32,6 +35,8 @@ extensions = [
     "sphinx.ext.viewcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -55,7 +60,6 @@ html_title = project
 html_theme = 'sphinx_book_theme'
 html_logo = 'assets/logos/vllm-logo-text-light.png'
 html_theme_options = {
-    'logo_only': True,
     'path_to_docs': 'docs/source',
     'repository_url': 'https://github.com/vllm-project/vllm',
     'use_repository_button': True,
@@ -64,4 +68,29 @@ html_theme_options = {
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+# html_static_path = ['_static']
+
+# Mock out external dependencies here.
+autodoc_mock_imports = [
+    "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
+    "vllm.cuda_utils", "vllm._C"
+]
+
+for mock_target in autodoc_mock_imports:
+    if mock_target in sys.modules:
+        logger.info(
+            f"Potentially problematic mock target ({mock_target}) found; "
+            "autodoc_mock_imports cannot mock modules that have already "
+            "been loaded into sys.modules when the sphinx build starts.")
+
+
+class MockedClassDocumenter(autodoc.ClassDocumenter):
+    """Remove note about base class when a class is derived from object."""
+
+    def add_line(self, line: str, source: str, *lineno: int) -> None:
+        if line == "   Bases: :py:class:`object`":
+            return
+        super().add_line(line, source, *lineno)
+
+
+autodoc.ClassDocumenter = MockedClassDocumenter
diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.rst
new file mode 100644
index 000000000000..47db1e0a401b
--- /dev/null
+++ b/docs/source/dev/engine/async_llm_engine.rst
@@ -0,0 +1,7 @@
+
+AsyncLLMEngine
+=================================
+
+.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
+    :members: generate, abort
+    :show-inheritance:
diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst
new file mode 100644
index 000000000000..ba9ae55ddea4
--- /dev/null
+++ b/docs/source/dev/engine/engine_index.rst
@@ -0,0 +1,13 @@
+vLLM Engine
+=================================
+
+.. automodule:: vllm.engine
+.. currentmodule:: vllm.engine
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Engines
+
+   llm_engine
+   async_llm_engine
+
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst
new file mode 100644
index 000000000000..b550a9b5faa6
--- /dev/null
+++ b/docs/source/dev/engine/llm_engine.rst
@@ -0,0 +1,6 @@
+LLMEngine
+=================================
+
+.. autoclass:: vllm.engine.llm_engine.LLMEngine
+    :members: add_request, abort_request, step, _init_cache
+    :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 816f4f7e2015..321f855645bb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -85,4 +85,16 @@ Documentation
    :maxdepth: 1
    :caption: Quantization
 
-   quantization/auto_awq
\ No newline at end of file
+   quantization/auto_awq
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Developer Documentation
+
+   dev/engine/engine_index
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 6c169d266905..5c217f6f4de6 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -88,6 +88,18 @@ class Scheduler:
         self.waiting.append(seq_group)
 
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Aborts a sequence group with the given ID.
+
+        Check if the sequence group with the given ID
+        is present in any of the state queues.
+        If present, remove the sequence group from the state queue.
+        Also, if any of the sequences in the sequence group is not finished,
+        free the sequence with status `FINISHED_ABORTED`.
+        Otherwise, do nothing.
+
+        Args:
+            request_id: The ID(s) of the sequence group to abort.
+        """
         if isinstance(request_id, str):
             request_id = (request_id, )
         request_ids = set(request_id)
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 92f23ec29bfd..8a5b00ca7f7c 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -253,7 +253,8 @@ class AsyncLLMEngine:
         log_requests: Whether to log the requests.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
-        *args, *kwargs: Arguments for LLMEngine.
+        *args: Arguments for LLMEngine.
+        **kwargs: Arguments for LLMEngine.
     """
 
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -428,6 +429,49 @@
         Yields:
             The output `RequestOutput` objects from the LLMEngine for the
             request.
+
+        Details:
+            - If the engine is not running, start the background loop,
+              which iteratively invokes
+              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+              to process the waiting requests.
+            - Add the request to the engine's `RequestTracker`.
+              On the next background loop, this request will be sent to
+              the underlying engine.
+              Also, a corresponding `AsyncStream` will be created.
+            - Wait for the request outputs from `AsyncStream` and yield them.
+
+        Example:
+            >>> # Please refer to entrypoints/api_server.py for
+            >>> # the complete example.
+            >>>
+            >>> # initialize the engine and the example input
+            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
+            >>> example_input = {
+            >>>     "prompt": "What is LLM?",
+            >>>     "stream": False,  # assume the non-streaming case
+            >>>     "temperature": 0.0,
+            >>>     "request_id": 0,
+            >>> }
+            >>>
+            >>> # start the generation
+            >>> results_generator = engine.generate(
+            >>>     example_input["prompt"],
+            >>>     SamplingParams(temperature=example_input["temperature"]),
+            >>>     example_input["request_id"])
+            >>>
+            >>> # get the results
+            >>> final_output = None
+            >>> async for request_output in results_generator:
+            >>>     if await request.is_disconnected():
+            >>>         # Abort the request if the client disconnects.
+            >>>         await engine.abort(request_id)
+            >>>         # Return or raise an error
+            >>>         ...
+            >>>     final_output = request_output
+            >>>
+            >>> # Process and return the final output
+            >>> ...
         """
         # Preprocess the request.
         # This should not be used for logging, as it is monotonic time.
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 1920946a31d7..e30bf5db4928 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -257,7 +257,26 @@ class LLMEngine:
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
     def _init_cache(self) -> None:
-        """Profiles the memory usage and initializes the KV cache."""
+        """Profiles the memory usage and initializes the KV cache.
+
+        The engine first profiles the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        More details can be found in the
+        :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
+        from class :class:`~vllm.worker.Worker`.
+
+        Afterwards, as there may be multiple workers,
+        we take the minimum number of blocks across all workers
+        to ensure this can be applied to all of them.
+
+        Finally, the engine will initialize the KV cache
+        with the calculated number of blocks.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers(
             "profile_num_available_blocks",
@@ -334,6 +353,30 @@
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
+
+        Details:
+            - Set arrival_time to the current time if it is None.
+            - Set prompt_token_ids to the encoded prompt if it is None.
+            - Create `best_of` number of :class:`~vllm.Sequence` objects.
+            - Create a :class:`~vllm.SequenceGroup` object
+              from the list of :class:`~vllm.Sequence`.
+            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
+
+        Example:
+            >>> # initialize engine
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> # set request arguments
+            >>> example_prompt = "Who is the president of the United States?"
+            >>> sampling_params = SamplingParams(temperature=0.0)
+            >>> request_id = 0
+            >>>
+            >>> # add the request to the engine
+            >>> engine.add_request(
+            >>>     str(request_id),
+            >>>     example_prompt,
+            >>>     sampling_params)
+            >>> # continue the request processing
+            >>> ...
         """
         if arrival_time is None:
             arrival_time = time.monotonic()
@@ -358,6 +401,17 @@
 
         Args:
            request_id: The ID(s) of the request to abort.
+
+        Details:
+            - Refer to the
+              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
+              from class :class:`~vllm.core.scheduler.Scheduler`.
+
+        Example:
+            >>> # initialize engine and add a request with request_id
+            >>> request_id = str(0)
+            >>> # abort the request
+            >>> engine.abort_request(request_id)
         """
         self.scheduler.abort_seq_group(request_id)
 
@@ -617,11 +671,53 @@
     def step(self) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
-        This function performs one decoding iteration of the engine. It first
-        schedules the sequences to be executed in the next iteration and the
-        token blocks to be swapped in/out/copy. Then, it executes the model
-        and updates the scheduler with the model outputs. Finally, it decodes
-        the sequences and returns the newly generated results.
+        .. figure:: https://i.imgur.com/sv2HssD.png
+            :alt: Overview of the step function
+            :align: center
+
+            Overview of the step function.
+
+        Details:
+            - Step 1: Schedules the sequences to be executed in the next
+              iteration and the token blocks to be swapped in/out/copy.
+
+                - Depending on the scheduling policy,
+                  sequences may be `preempted/reordered`.
+                - A Sequence Group (SG) refers to a group of sequences
+                  that are generated from the same prompt.
+
+            - Step 2: Calls the workers to execute the model.
+            - Step 3: Processes the model output. This mainly includes:
+
+                - Decodes the relevant outputs.
+                - Updates the scheduled sequence groups with model outputs
+                  based on their `sampling parameters` (`use_beam_search` or not).
+                - Frees the finished sequence groups.
+
+            - Finally, it creates and returns the newly generated results.
+
+        Example:
+            >>> # Please see the examples/ folder for more detailed examples.
+            >>>
+            >>> # initialize engine and request arguments
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> example_inputs = [(0, "What is LLM?",
+            >>>     SamplingParams(temperature=0.0))]
+            >>>
+            >>> # Start the engine with an event loop
+            >>> while True:
+            >>>     if example_inputs:
+            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
+            >>>         engine.add_request(str(req_id), prompt, sampling_params)
+            >>>
+            >>>     # continue the request processing
+            >>>     request_outputs = engine.step()
+            >>>     for request_output in request_outputs:
+            >>>         if request_output.finished:
+            >>>             # return or show the request output
+            >>>
+            >>>     if not (engine.has_unfinished_requests() or example_inputs):
+            >>>         break
         """
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
 
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 6c83f708bd9c..b1d85cc08e92 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -87,6 +87,14 @@
         gpu_memory_utilization: float,
         cpu_swap_space: int,
     ) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model and returns the maximum
+        number of GPU and CPU cache blocks that can be allocated.
+
+        Args:
+            block_size: The size of the cache block.
+            gpu_memory_utilization: The fraction of the total GPU memory to use.
+            cpu_swap_space: The size of the CPU swap space in bytes.
+        """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
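The `LLMEngine.step()` docstring added above shows the synchronous engine loop only in doctest form. The snippet below is a minimal, runnable sketch of that same loop, not part of the patch: it assumes a local vLLM installation, and the model name "facebook/opt-125m", the prompts, and the `max_tokens` value are illustrative stand-ins. `EngineArgs`, `LLMEngine`, and `SamplingParams` are the public classes the new docstrings reference.

# A minimal, runnable sketch of the engine loop documented in LLMEngine.step().
# Assumption: vLLM is installed and "facebook/opt-125m" is a small stand-in model.

from vllm import EngineArgs, LLMEngine, SamplingParams


def main() -> None:
    # Build the engine directly from EngineArgs, as in the docstring example.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

    # (request_id, prompt, sampling_params) tuples, mirroring the step() example.
    example_inputs = [
        (0, "What is LLM?", SamplingParams(temperature=0.0)),
        (1, "Who is the president of the United States?",
         SamplingParams(temperature=0.8, max_tokens=64)),
    ]

    # Feed requests in, then keep stepping until everything has finished.
    while True:
        if example_inputs:
            req_id, prompt, sampling_params = example_inputs.pop(0)
            engine.add_request(str(req_id), prompt, sampling_params)

        # One decoding iteration: schedule -> execute the model -> process outputs.
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.request_id,
                      request_output.outputs[0].text)

        if not (engine.has_unfinished_requests() or example_inputs):
            break


if __name__ == "__main__":
    main()

For the asynchronous path, `AsyncLLMEngine.generate()` documented in the patch plays the same role behind a background loop, as its docstring example (based on entrypoints/api_server.py) shows.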