Expose PyTorch profiler configuration to environment variables (#21803)

Signed-off-by: Csrayz <33659823+Csrayz@users.noreply.github.com>
Csrayz 2025-07-30 10:46:31 +08:00 committed by GitHub
parent fb58e3a651
commit b917da442b
4 changed files with 60 additions and 4 deletions

View File

@@ -5,7 +5,12 @@
## Profile with PyTorch Profiler

We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables:
- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording tensor shapes (off by default)
- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to enable memory profiling (off by default)
- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information (on by default)
- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs (off by default)
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
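For example, the knobs can also be set programmatically before vLLM is imported. A minimal offline sketch, assuming the `LLM` entry point and its `start_profile()`/`stop_profile()` helpers; the model name and trace directory are placeholders:

```python
import os

# Example values only; set these before vLLM reads its environment.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "/mnt/traces/"
os.environ["VLLM_TORCH_PROFILER_RECORD_SHAPES"] = "1"        # tensor shapes on
os.environ["VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY"] = "1"  # memory profiling on
os.environ["VLLM_TORCH_PROFILER_WITH_STACK"] = "0"           # stacks off (on by default)
os.environ["VLLM_TORCH_PROFILER_WITH_FLOPS"] = "1"           # FLOPs estimation on

from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # placeholder model
llm.start_profile()
llm.generate("Hello, my name is")
llm.stop_profile()  # gzipped traces land in VLLM_TORCH_PROFILER_DIR
```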

View File

@@ -80,6 +80,10 @@ if TYPE_CHECKING:
    VLLM_PLUGINS: Optional[list[str]] = None
    VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
    VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
    VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
    VLLM_TORCH_PROFILER_WITH_STACK: bool = True
    VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False
    VLLM_USE_TRITON_AWQ: bool = False
    VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
    VLLM_SKIP_P2P_CHECK: bool = False
@@ -629,6 +633,31 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
             .path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),
    # Enable torch profiler to record shapes if set
    # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will
    # not record shapes.
    "VLLM_TORCH_PROFILER_RECORD_SHAPES":
    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0"),
    # Enable torch profiler to profile memory if set
    # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler
    # will not profile memory.
    "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY":
    lambda: bool(
        os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0"),
    # Enable torch profiler to record stack information if set
    # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler will
    # record stack information by default.
    "VLLM_TORCH_PROFILER_WITH_STACK":
    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0"),
    # Enable torch profiler to record FLOPs if set
    # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will
    # not record FLOPs.
    "VLLM_TORCH_PROFILER_WITH_FLOPS":
    lambda: bool(os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0"),
    # If set, vLLM will use Triton implementations of AWQ.
    "VLLM_USE_TRITON_AWQ":
    lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
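All four new entries share one parsing convention: the flag is true whenever the variable's value (or its default) is any string other than `"0"`. A small sketch with a hypothetical `_env_flag` helper to make the semantics explicit:

```python
import os

def _env_flag(name: str, default: str = "0") -> bool:
    # Hypothetical helper mirroring the lambdas above: anything but the
    # literal string "0" counts as enabled.
    return os.getenv(name, default) != "0"

os.environ["VLLM_TORCH_PROFILER_WITH_FLOPS"] = "1"
assert _env_flag("VLLM_TORCH_PROFILER_WITH_FLOPS")

# Caveat of the convention: "false" is not "0", so it still enables the flag.
os.environ["VLLM_TORCH_PROFILER_WITH_FLOPS"] = "false"
assert _env_flag("VLLM_TORCH_PROFILER_WITH_FLOPS")

os.environ["VLLM_TORCH_PROFILER_WITH_FLOPS"] = "0"
assert not _env_flag("VLLM_TORCH_PROFILER_WITH_FLOPS")
```

This differs from `VLLM_USE_TRITON_AWQ` just below, whose `bool(int(...))` parsing raises on non-numeric input rather than silently enabling the flag.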

View File

@@ -71,12 +71,23 @@ class Worker(WorkerBase):
            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
            logger.info("Profiling enabled. Traces will be saved to: %s",
                        torch_profiler_trace_dir)
            logger.debug(
                "Profiler config: record_shapes=%s, "
                "profile_memory=%s, with_stack=%s, with_flops=%s",
                envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
                envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
                envs.VLLM_TORCH_PROFILER_WITH_STACK,
                envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
            )
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    torch_profiler_trace_dir, use_gzip=True))
        else:
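Each environment variable maps one-to-one onto a `torch.profiler.profile` keyword argument. A standalone sketch of the equivalent configuration with the `envs.*` reads inlined as their defaults; the workload and trace directory are illustrative only:

```python
import torch
from torch.profiler import ProfilerActivity, profile, tensorboard_trace_handler

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=False,   # envs.VLLM_TORCH_PROFILER_RECORD_SHAPES
    profile_memory=False,  # envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY
    with_stack=True,       # envs.VLLM_TORCH_PROFILER_WITH_STACK
    with_flops=False,      # envs.VLLM_TORCH_PROFILER_WITH_FLOPS
    on_trace_ready=tensorboard_trace_handler("/tmp/traces", use_gzip=True),
)

prof.start()
x = torch.randn(1024, 1024)
y = x @ x  # small workload so the trace has something to show
prof.stop()  # the handler writes a gzipped trace when profiling stops
```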
@@ -209,7 +220,7 @@ class Worker(WorkerBase):
    @torch.inference_mode()
    def determine_available_memory(self) -> int:
        """Profiles the peak memory usage of the model to determine how much
        memory can be used for KV cache without OOMs.

        The engine will first conduct a profiling of the existing memory usage.

View File

@@ -41,12 +41,23 @@ class XPUWorker(Worker):
            torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
            logger.info("Profiling enabled. Traces will be saved to: %s",
                        torch_profiler_trace_dir)
            logger.debug(
                "Profiler config: record_shapes=%s, "
                "profile_memory=%s, with_stack=%s, with_flops=%s",
                envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
                envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
                envs.VLLM_TORCH_PROFILER_WITH_STACK,
                envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
            )
            self.profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.XPU,
                ],
                record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
                profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
                with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    torch_profiler_trace_dir, use_gzip=True))
        else:
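The XPU worker's profiler setup is identical to the GPU worker's except for the device activity it hands to `torch.profiler`. A hypothetical helper sketching that selection, assuming a PyTorch build that exposes `ProfilerActivity.XPU` (vLLM itself hard-codes the activity per worker class rather than detecting it at runtime):

```python
import torch
from torch.profiler import ProfilerActivity

def profiler_activities() -> list[ProfilerActivity]:
    # CPU events are always collected; add whichever device backend exists.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)  # gpu_worker path
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        activities.append(ProfilerActivity.XPU)   # xpu_worker path
    return activities
```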