# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
from typing import Any, Literal

from pydantic import Field, model_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self

import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils.hashing import safe_hash

logger = init_logger(__name__)

ProfilerKind = Literal["torch", "cuda"]


@config
@dataclass
class ProfilerConfig:
    """Dataclass which contains profiler config for the engine."""

    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:

    - 'torch': Use PyTorch profiler.\n
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
    """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and
    the workers' traces (CPU & GPU) will be saved under this directory. Note
    that it must be an absolute path."""

    torch_profiler_with_stack: bool = True
    """If `True`, enables stack tracing in the torch profiler. Enabled by default."""

    torch_profiler_with_flops: bool = False
    """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""

    torch_profiler_use_gzip: bool = True
    """If `True`, saves torch profiler traces in gzip format. Enabled by default."""

    torch_profiler_dump_cuda_time_total: bool = True
    """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default."""

    torch_profiler_record_shapes: bool = False
    """If `True`, records tensor shapes in the torch profiler. Disabled by default."""

    torch_profiler_with_memory: bool = False
    """If `True`, enables memory profiling in the torch profiler.
    Disabled by default."""

    ignore_frontend: bool = False
    """If `True`, disables the front-end profiling of AsyncLLM when using the
    'torch' profiler. This is needed to reduce overhead when using the
    delay/limit options, since the front-end profiling does not track
    iterations and would capture the entire range.
    """

    delay_iterations: int = Field(default=0, ge=0)
    """Number of engine iterations to skip before starting profiling.
    Defaults to 0, meaning profiling starts immediately after receiving
    /start_profile.
    """

    max_iterations: int = Field(default=0, ge=0)
    """Maximum number of engine iterations to profile after profiling starts.
    Defaults to 0, meaning no limit.
    """
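
    # Illustrative note (an assumption based on the docstrings above, not a
    # guarantee from this module): with delay_iterations=10 and
    # max_iterations=100, profiling would cover engine iterations 10 through
    # 109 after /start_profile is received.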

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation graph from input
        ids/embeddings to the final hidden states, excluding anything before
        input ids/embeddings and after the final hidden states.
        """
        # No factors to consider:
        # this config does not affect the computation graph.
        factors: list[Any] = []
        hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
        return hash_str
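
    # Note: since `factors` is empty, every ProfilerConfig instance hashes to
    # the same value, i.e. profiler settings never invalidate a cached
    # computation graph.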

    def _get_from_env_if_set(
        self, field_name: str, env_var_name: str
    ) -> Any | None:
        """Get a field's value from an env var if set, with a deprecation warning."""
        if envs.is_set(env_var_name):
            value = getattr(envs, env_var_name)
            logger.warning_once(
                "Using %s environment variable is deprecated and will be removed in "
                "v0.14.0 or v1.0.0, whichever is soonest. Please use the "
                "--profiler-config.%s command line argument or the "
                "ProfilerConfig(%s=...) config field instead.",
                env_var_name,
                field_name,
                field_name,
            )
            return value
        return None

    def _set_from_env_if_set(
        self,
        field_name: str,
        env_var_name: str,
        to_bool: bool = True,
        to_int: bool = False,
    ) -> None:
        """Set a field from an env var if set, with a deprecation warning."""
        value = self._get_from_env_if_set(field_name, env_var_name)
        if value is not None:
            if to_bool:
                # Env var values are strings, so "1" is treated as True.
                value = value == "1"
            if to_int:
                value = int(value)
            setattr(self, field_name, value)
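
    # Illustrative example (comment only): with the deprecated env var
    # VLLM_TORCH_PROFILER_WITH_FLOPS=1 set, the validator below would call
    # _set_from_env_if_set("torch_profiler_with_flops", ...) and flip that
    # field to True.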

    @model_validator(mode="after")
    def _validate_profiler_config(self) -> Self:
        maybe_use_cuda_profiler = self._get_from_env_if_set(
            "profiler", "VLLM_TORCH_CUDA_PROFILE"
        )
        if maybe_use_cuda_profiler is not None:
            self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None
        else:
            self._set_from_env_if_set(
                "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False
            )
            if self.torch_profiler_dir:
                self.profiler = "torch"
            self._set_from_env_if_set(
                "torch_profiler_record_shapes",
                "VLLM_TORCH_PROFILER_RECORD_SHAPES",
            )
            self._set_from_env_if_set(
                "torch_profiler_with_memory",
                "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY",
            )
            self._set_from_env_if_set(
                "torch_profiler_with_stack",
                "VLLM_TORCH_PROFILER_WITH_STACK",
            )
            self._set_from_env_if_set(
                "torch_profiler_with_flops",
                "VLLM_TORCH_PROFILER_WITH_FLOPS",
            )
            self._set_from_env_if_set(
                "ignore_frontend",
                "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM",
            )
            self._set_from_env_if_set(
                "torch_profiler_use_gzip",
                "VLLM_TORCH_PROFILER_USE_GZIP",
            )
            self._set_from_env_if_set(
                "torch_profiler_dump_cuda_time_total",
                "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL",
            )

        self._set_from_env_if_set(
            "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True
        )
        self._set_from_env_if_set(
            "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True
        )

        has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0
        if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend:
            logger.warning_once(
                "Using the 'torch' profiler with delay_iterations or "
                "max_iterations while ignore_frontend is False may result in "
                "high overhead."
            )

        profiler_dir = self.torch_profiler_dir
        if profiler_dir and self.profiler != "torch":
            raise ValueError(
                "torch_profiler_dir is only applicable when profiler is set to 'torch'"
            )
        if self.profiler == "torch" and not profiler_dir:
            raise ValueError("torch_profiler_dir must be set when profiler is 'torch'")

        if profiler_dir:
            # Leave Google Cloud Storage paths (gs://bucket/...) untouched;
            # only local paths are expanded and made absolute.
            is_gs_path = (
                profiler_dir.startswith("gs://")
                and profiler_dir[5:]
                and profiler_dir[5] != "/"
            )
            if not is_gs_path:
                self.torch_profiler_dir = os.path.abspath(
                    os.path.expanduser(profiler_dir)
                )

        return self
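
# A minimal usage sketch (illustrative, not part of the module; the values
# below are assumptions for demonstration, not recommended defaults):
#
#     config = ProfilerConfig(
#         profiler="torch",
#         torch_profiler_dir="/tmp/vllm_profiles",
#         delay_iterations=5,
#         max_iterations=20,
#         ignore_frontend=True,
#     )
#
# Pydantic validation runs on construction, so an invalid combination such as
# ProfilerConfig(profiler="torch") without torch_profiler_dir raises a
# ValueError.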