From 2497228ad4427310bc55427f6db404a00de4fd78 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 17 Dec 2025 23:32:17 +0800
Subject: [PATCH] [Chore] Factor out logic for requesting initial memory (#30868)

Signed-off-by: DarkLight1337
---
 vllm/utils/mem_utils.py      | 31 +++++++++++++++++++++++++++----
 vllm/v1/worker/gpu_worker.py | 20 ++++----------------
 vllm/v1/worker/utils.py      | 26 +++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 21 deletions(-)

diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index e2517b935bf28..bf6d7846573b9 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -66,27 +66,43 @@ class MemorySnapshot:
     torch_memory: int = 0
     non_torch_memory: int = 0
     timestamp: float = 0.0
+
+    device: torch.types.Device = None
     auto_measure: bool = True
 
     def __post_init__(self) -> None:
+        if self.device is None:
+            from vllm.platforms import current_platform
+
+            device_fn = current_platform.current_device
+            assert device_fn is not None
+            self.device_ = torch.device(device_fn())
+        else:
+            self.device_ = torch.device(self.device)
+
         if self.auto_measure:
             self.measure()
 
     def measure(self) -> None:
         from vllm.platforms import current_platform
 
+        device = self.device_
+
         # we measure the torch peak memory usage via allocated_bytes,
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
         # `torch.cuda.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.cuda.empty_cache()` or OOM happens.
-        self.torch_peak = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
+        self.torch_peak = torch.cuda.memory_stats(device).get(
+            "allocated_bytes.all.peak", 0
+        )
 
-        self.free_memory, self.total_memory = torch.cuda.mem_get_info()
+        self.free_memory, self.total_memory = torch.cuda.mem_get_info(device)
         shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
         if (
             current_platform.is_cuda()
-            and current_platform.get_device_capability() in shared_sysmem_device_mem_sms
+            and current_platform.get_device_capability(device.index)
+            in shared_sysmem_device_mem_sms
         ):
             # On UMA (Orin, Thor and Spark) platform,
             # where both CPU and GPU rely on system memory,
@@ -106,12 +122,18 @@ class MemorySnapshot:
         # torch.cuda.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
        # this is used to measure the non-torch memory usage
-        self.torch_memory = torch.cuda.memory_reserved()
+        self.torch_memory = torch.cuda.memory_reserved(device)
 
         self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
 
     def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
+        if self.device_ != other.device_:
+            raise ValueError(
+                "The two snapshots should be from the same device! "
+                f"Found: {self.device_} vs. {other.device_}"
+            )
+
         return MemorySnapshot(
             torch_peak=self.torch_peak - other.torch_peak,
             free_memory=self.free_memory - other.free_memory,
@@ -120,6 +142,7 @@ class MemorySnapshot:
             torch_memory=self.torch_memory - other.torch_memory,
             non_torch_memory=self.non_torch_memory - other.non_torch_memory,
             timestamp=self.timestamp - other.timestamp,
+            device=self.device_,
             auto_measure=False,
         )
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 1e13650cd083e..bc71351d2cc55 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -56,6 +56,8 @@ from vllm.v1.worker.utils import is_residual_scattered_for_sp
 from vllm.v1.worker.worker_base import WorkerBase
 from vllm.v1.worker.workspace import init_workspace_manager
 
+from .utils import request_memory
+
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
@@ -237,22 +239,8 @@ class Worker(WorkerBase):
             torch.cuda.empty_cache()
 
             # take current memory snapshot
-            self.init_snapshot = MemorySnapshot()
-            self.requested_memory = (
-                self.init_snapshot.total_memory
-                * self.cache_config.gpu_memory_utilization
-            )
-            if self.init_snapshot.free_memory < self.requested_memory:
-                GiB = lambda b: round(b / GiB_bytes, 2)
-                raise ValueError(
-                    f"Free memory on device "
-                    f"({GiB(self.init_snapshot.free_memory)}/"
-                    f"{GiB(self.init_snapshot.total_memory)} GiB) on startup "
-                    f"is less than desired GPU memory utilization "
-                    f"({self.cache_config.gpu_memory_utilization}, "
-                    f"{GiB(self.requested_memory)} GiB). Decrease GPU memory "
-                    f"utilization or reduce GPU memory used by other processes."
-                )
+            self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
+            self.requested_memory = request_memory(init_snapshot, self.cache_config)
         else:
             raise RuntimeError(f"Not support device type: {self.device_config.device}")
 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 2e8afec024ce9..31ccf7f157468 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -8,13 +8,15 @@ from typing_extensions import deprecated
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.multimodal.cache import processor_only_cache_from_config
 from vllm.multimodal.registry import MultiModalRegistry
 from vllm.platforms import current_platform
+from vllm.utils.mem_constants import GiB_bytes
+from vllm.utils.mem_utils import MemorySnapshot
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
@@ -248,6 +250,28 @@ def gather_mm_placeholders(
     return placeholders[is_embed]
 
 
+def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
+    """
+    Calculate the amount of memory required by vLLM, then validate
+    that the current amount of free memory is sufficient for that.
+    """
+    requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
+
+    if init_snapshot.free_memory < requested_memory:
+        GiB = lambda b: round(b / GiB_bytes, 2)
+        raise ValueError(
+            f"Free memory on device {init_snapshot.device_} "
+            f"({GiB(init_snapshot.free_memory)}/"
+            f"{GiB(init_snapshot.total_memory)} GiB) on startup "
+            f"is less than desired GPU memory utilization "
+            f"({cache_config.gpu_memory_utilization}, "
+            f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+            f"utilization or reduce GPU memory used by other processes."
+        )
+
+    return requested_memory
+
+
 def add_kv_sharing_layers_to_kv_cache_groups(
     shared_kv_cache_layers: dict[str, str],
     kv_cache_groups: list[KVCacheGroupSpec],
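
Editor's note (not part of the patch): a minimal sketch of how the refactored pieces fit together after this change, mirroring the gpu_worker.py hunk above. It assumes a CUDA build of vLLM with at least one visible GPU, that CacheConfig's other fields can stay at their defaults, and that "cuda:0" is just an example device; variable names here are illustrative, not from the patch.

import torch

from vllm.config import CacheConfig
from vllm.utils.mem_utils import MemorySnapshot
from vllm.v1.worker.utils import request_memory

device = torch.device("cuda:0")
torch.cuda.set_device(device)
torch.cuda.empty_cache()

# Snapshots are now pinned to an explicit device instead of the implicit
# current device, so multi-GPU workers measure the card they actually use.
init_snapshot = MemorySnapshot(device=device)

# request_memory() computes total_memory * gpu_memory_utilization and raises
# ValueError when the device's free memory is already below that budget.
cache_config = CacheConfig(gpu_memory_utilization=0.9)
requested_memory = request_memory(init_snapshot, cache_config)

# Subtracting snapshots taken on different devices now fails loudly in
# __sub__ instead of silently mixing measurements.
after_profile = MemorySnapshot(device=device)
usage = after_profile - init_snapshot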