mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-03 19:11:19 +08:00
[V1] Reuse V0's memory_profiling util for gpu worker memory profiling (#19312)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
This commit is contained in:
parent
3a7cd627a8
commit
cc867be19c
@ -2269,6 +2269,8 @@ def kill_process_tree(pid: int):
|
|||||||
class MemorySnapshot:
|
class MemorySnapshot:
|
||||||
"""Memory snapshot."""
|
"""Memory snapshot."""
|
||||||
torch_peak: int = 0
|
torch_peak: int = 0
|
||||||
|
free_memory: int = 0
|
||||||
|
total_memory: int = 0
|
||||||
cuda_memory: int = 0
|
cuda_memory: int = 0
|
||||||
torch_memory: int = 0
|
torch_memory: int = 0
|
||||||
non_torch_memory: int = 0
|
non_torch_memory: int = 0
|
||||||
@ -2288,8 +2290,8 @@ class MemorySnapshot:
|
|||||||
self.torch_peak = torch.cuda.memory_stats().get(
|
self.torch_peak = torch.cuda.memory_stats().get(
|
||||||
"allocated_bytes.all.peak", 0)
|
"allocated_bytes.all.peak", 0)
|
||||||
|
|
||||||
self.cuda_memory = torch.cuda.mem_get_info(
|
self.free_memory, self.total_memory = torch.cuda.mem_get_info()
|
||||||
)[1] - torch.cuda.mem_get_info()[0]
|
self.cuda_memory = self.total_memory - self.free_memory
|
||||||
|
|
||||||
# torch.cuda.memory_reserved() is how many bytes
|
# torch.cuda.memory_reserved() is how many bytes
|
||||||
# PyTorch gets from cuda (by calling cudaMalloc, etc.)
|
# PyTorch gets from cuda (by calling cudaMalloc, etc.)
|
||||||
@ -2302,6 +2304,8 @@ class MemorySnapshot:
|
|||||||
def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
|
def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
|
||||||
return MemorySnapshot(
|
return MemorySnapshot(
|
||||||
torch_peak=self.torch_peak - other.torch_peak,
|
torch_peak=self.torch_peak - other.torch_peak,
|
||||||
|
free_memory=self.free_memory - other.free_memory,
|
||||||
|
total_memory=self.total_memory - other.total_memory,
|
||||||
cuda_memory=self.cuda_memory - other.cuda_memory,
|
cuda_memory=self.cuda_memory - other.cuda_memory,
|
||||||
torch_memory=self.torch_memory - other.torch_memory,
|
torch_memory=self.torch_memory - other.torch_memory,
|
||||||
non_torch_memory=self.non_torch_memory - other.non_torch_memory,
|
non_torch_memory=self.non_torch_memory - other.non_torch_memory,
|
||||||
@ -2323,6 +2327,16 @@ class MemoryProfilingResult:
|
|||||||
after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
|
after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
|
||||||
profile_time: float = 0.0
|
profile_time: float = 0.0
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
    """Render the profiling result as a single human-readable sentence.

    Reports the profiling duration in seconds followed by four memory
    figures (non KV cache, torch peak increase, non-torch forward
    increase, weights), each divided by ``GiB_bytes`` and printed with
    two decimal places.
    """

    def as_gib(num_bytes) -> str:
        # Scale by GiB_bytes and append the unit suffix used in the report.
        return f"{(num_bytes / GiB_bytes):.2f}GiB"

    # Assemble the sentence piece by piece; attribute reads happen in the
    # same left-to-right order as they appear in the final text.
    pieces = [
        f"Memory profiling takes {self.profile_time:.2f} seconds. ",
        "Total non KV cache memory: ",
        f"{as_gib(self.non_kv_cache_memory)}; ",
        "torch peak memory increase: ",
        f"{as_gib(self.torch_peak_increase)}; ",
        "non-torch forward increase memory: ",
        f"{as_gib(self.non_torch_increase)}; ",
        f"weights memory: {as_gib(self.weights_memory)}.",
    ]
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def memory_profiling(
|
def memory_profiling(
|
||||||
|
|||||||
@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
|
|||||||
from vllm.model_executor import set_random_seed
|
from vllm.model_executor import set_random_seed
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
from vllm.utils import GiB_bytes
|
from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling
|
||||||
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
||||||
from vllm.v1.outputs import ModelRunnerOutput
|
from vllm.v1.outputs import ModelRunnerOutput
|
||||||
from vllm.v1.utils import report_usage_stats
|
from vllm.v1.utils import report_usage_stats
|
||||||
@ -130,20 +130,22 @@ class Worker(WorkerBase):
|
|||||||
_check_if_gpu_supports_dtype(self.model_config.dtype)
|
_check_if_gpu_supports_dtype(self.model_config.dtype)
|
||||||
gc.collect()
|
gc.collect()
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
|
|
||||||
requested_memory = (total_gpu_memory *
|
# take current memory snapshot
|
||||||
self.cache_config.gpu_memory_utilization)
|
self.init_snapshot = MemorySnapshot()
|
||||||
if self.init_gpu_memory < requested_memory:
|
self.requested_memory = (self.init_snapshot.total_memory *
|
||||||
|
self.cache_config.gpu_memory_utilization)
|
||||||
|
if self.init_snapshot.free_memory < self.requested_memory:
|
||||||
GiB = lambda b: round(b / GiB_bytes, 2)
|
GiB = lambda b: round(b / GiB_bytes, 2)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Free memory on device ({GiB(self.init_gpu_memory)}/"
|
f"Free memory on device "
|
||||||
f"{GiB(total_gpu_memory)} GiB) on startup is less than "
|
f"({GiB(self.init_snapshot.free_memory)}/"
|
||||||
f"desired GPU memory utilization "
|
f"{GiB(self.init_snapshot.total_memory)} GiB) on startup "
|
||||||
|
f"is less than desired GPU memory utilization "
|
||||||
f"({self.cache_config.gpu_memory_utilization}, "
|
f"({self.cache_config.gpu_memory_utilization}, "
|
||||||
f"{GiB(requested_memory)} GiB). Decrease GPU memory "
|
f"{GiB(self.requested_memory)} GiB). Decrease GPU memory "
|
||||||
f"utilization or reduce GPU memory used by other processes."
|
f"utilization or reduce GPU memory used by other processes."
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Not support device type: {self.device_config.device}")
|
f"Not support device type: {self.device_config.device}")
|
||||||
@ -192,57 +194,39 @@ class Worker(WorkerBase):
|
|||||||
"""
|
"""
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
torch.cuda.reset_peak_memory_stats()
|
torch.cuda.reset_peak_memory_stats()
|
||||||
|
GiB = lambda b: b / GiB_bytes
|
||||||
|
|
||||||
_, total_gpu_memory = torch.cuda.mem_get_info()
|
|
||||||
# Execute a forward pass with dummy inputs to profile the memory usage
|
# Execute a forward pass with dummy inputs to profile the memory usage
|
||||||
# of the model.
|
# of the model.
|
||||||
self.model_runner.profile_run()
|
with memory_profiling(
|
||||||
|
self.init_snapshot,
|
||||||
|
weights_memory=int(
|
||||||
|
self.model_runner.model_memory_usage)) as profile_result:
|
||||||
|
self.model_runner.profile_run()
|
||||||
|
|
||||||
free_gpu_memory, _ = torch.cuda.mem_get_info()
|
free_gpu_memory = profile_result.after_profile.free_memory
|
||||||
# NOTE(woosuk): Here we assume that the other processes using the same
|
# NOTE(woosuk): Here we assume that the other processes using the same
|
||||||
# GPU did not change their memory usage during the profiling.
|
# GPU did not change their memory usage during the profiling.
|
||||||
assert self.init_gpu_memory > free_gpu_memory, (
|
assert self.init_snapshot.free_memory > free_gpu_memory, (
|
||||||
"Error in memory profiling. "
|
"Error in memory profiling. "
|
||||||
f"Initial free memory {self.init_gpu_memory/GiB_bytes} GiB, "
|
f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
|
||||||
f"current free memory {free_gpu_memory/GiB_bytes} GiB. "
|
f"current free memory {GiB(free_gpu_memory)} GiB. "
|
||||||
f"This happens when the GPU memory was not properly cleaned up "
|
"This happens when other processes sharing the same container "
|
||||||
f"before initializing the vLLM instance.")
|
"release GPU memory while vLLM is profiling during initialization. "
|
||||||
|
"To fix this, ensure consistent GPU memory allocation or "
|
||||||
|
"isolate vLLM in its own container.")
|
||||||
|
available_kv_cache_memory = self.requested_memory \
|
||||||
|
- profile_result.non_kv_cache_memory
|
||||||
|
|
||||||
# Get the peak memory allocation recorded by torch
|
|
||||||
peak_torch_memory = torch.cuda.memory_stats(
|
|
||||||
)["allocated_bytes.all.peak"]
|
|
||||||
|
|
||||||
# Check for any memory left around that may have been allocated on the
|
|
||||||
# gpu outside of `torch`. NCCL operations, for example, can use a few
|
|
||||||
# GB during a forward pass.
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
torch_allocated_bytes = torch.cuda.memory_stats(
|
|
||||||
)["allocated_bytes.all.current"]
|
|
||||||
|
|
||||||
# Reset after emptying torch cache
|
|
||||||
free_gpu_memory = torch.cuda.mem_get_info()[0]
|
|
||||||
|
|
||||||
# Total forward allocation (current) is equal to the diff in free memory
|
|
||||||
fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory
|
|
||||||
# We assume current non-torch allocation is equal to peak
|
|
||||||
non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)
|
|
||||||
# Total forward allocation (peak) is peak torch + non-torch
|
|
||||||
peak_memory = peak_torch_memory + non_torch_alloc_bytes
|
|
||||||
|
|
||||||
available_kv_cache_memory = (
|
|
||||||
total_gpu_memory * self.cache_config.gpu_memory_utilization -
|
|
||||||
peak_memory)
|
|
||||||
|
|
||||||
GiB = lambda b: b / GiB_bytes
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Initial free memory: %.2f GiB, free memory: %.2f GiB, "
|
"Initial free memory: %.2f GiB, free memory: %.2f GiB, "
|
||||||
"total GPU memory: %.2f GiB", GiB(self.init_gpu_memory),
|
"requested GPU memory: %.2f GiB",
|
||||||
GiB(free_gpu_memory), GiB(total_gpu_memory))
|
GiB(self.init_snapshot.free_memory), GiB(free_gpu_memory),
|
||||||
logger.debug(
|
GiB(self.requested_memory))
|
||||||
"Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
|
logger.debug(profile_result)
|
||||||
"%.2f GiB, available KVCache memory: %.2f GiB",
|
logger.info("Available KV cache memory: %.2f GiB",
|
||||||
GiB(peak_torch_memory), GiB(non_torch_alloc_bytes),
|
GiB(available_kv_cache_memory))
|
||||||
GiB(available_kv_cache_memory))
|
gc.collect()
|
||||||
|
|
||||||
return int(available_kv_cache_memory)
|
return int(available_kv_cache_memory)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user