diff --git a/vllm/envs.py b/vllm/envs.py
index 861e4c6a1bbe5..70068cca66f8f 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -667,8 +667,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_LORA_RESOLVER_CACHE_DIR":
     lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None),
 
-    # Enables torch profiler if set. Path to the directory where torch profiler
-    # traces are saved. Note that it must be an absolute path.
+    # Enables torch profiler if set.
+    # Both AsyncLLM's CPU traces and workers'
+    # traces (CPU & GPU) will be saved under this directory.
+    # Note that it must be an absolute path.
     "VLLM_TORCH_PROFILER_DIR":
     lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
              .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 664fec31a4da5..342d7b24f8e98 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
+import os
+import socket
 import time
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, Optional, Union
 
 import numpy as np
+import torch
 
 import vllm.envs as envs
 from vllm.config import ModelConfig, VllmConfig
@@ -144,6 +147,26 @@ class AsyncLLM(EngineClient):
         except RuntimeError:
             pass
 
+        if envs.VLLM_TORCH_PROFILER_DIR:
+            logger.info(
+                "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s",  # noqa: E501
+                envs.VLLM_TORCH_PROFILER_DIR)
+            worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                ],
+                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                    envs.VLLM_TORCH_PROFILER_DIR,
+                    worker_name=worker_name,
+                    use_gzip=True))
+        else:
+            logger.info(
+                "Torch profiler disabled. AsyncLLM CPU traces will not be collected."  # noqa: E501
+            )
+            self.profiler = None
+
     @classmethod
     @deprecate_kwargs(
         "disable_log_requests",
@@ -562,10 +585,16 @@ class AsyncLLM(EngineClient):
             raise self.dead_error
 
     async def start_profile(self) -> None:
-        await self.engine_core.profile_async(True)
+        coros = [self.engine_core.profile_async(True)]
+        if self.profiler is not None:
+            coros.append(asyncio.to_thread(self.profiler.start))
+        await asyncio.gather(*coros)
 
     async def stop_profile(self) -> None:
-        await self.engine_core.profile_async(False)
+        coros = [self.engine_core.profile_async(False)]
+        if self.profiler is not None:
+            coros.append(asyncio.to_thread(self.profiler.stop))
+        await asyncio.gather(*coros)
 
     async def reset_mm_cache(self) -> None:
         self.processor.mm_registry.reset_processor_cache(self.model_config)
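
For reference, a minimal usage sketch (not part of the patch) of how the new AsyncLLM-level profiler hooks could be exercised end to end. The model name, output directory, and request id below are illustrative, and the AsyncLLM.from_engine_args / generate signatures are assumed from the current vLLM v1 API; VLLM_TORCH_PROFILER_DIR must be an absolute path and is set before vLLM reads its environment variables.

# Usage sketch under the assumptions above; not a definitive example.
import asyncio
import os

# Must be an absolute path; set it before vLLM reads VLLM_TORCH_PROFILER_DIR.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_profile"

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM


async def main() -> None:
    engine = AsyncLLM.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))

    # Starts the workers' profilers (engine_core.profile_async) and, with this
    # patch, the AsyncLLM-level CPU profiler as well.
    await engine.start_profile()

    async for _ in engine.generate("Hello, my name is",
                                   SamplingParams(max_tokens=16),
                                   request_id="profile-demo"):
        pass

    # Stops both; the AsyncLLM trace is written as a gzipped
    # <hostname>_<pid>.async_llm trace under VLLM_TORCH_PROFILER_DIR.
    await engine.stop_profile()

    engine.shutdown()


if __name__ == "__main__":
    asyncio.run(main())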