From 82fbeae92b86e404829a01441334a9505e8b190d Mon Sep 17 00:00:00 2001 From: Jun Duan Date: Sat, 1 Mar 2025 20:20:30 -0500 Subject: [PATCH] [Misc] Accurately capture the time of loading weights (#14063) Signed-off-by: Jun Duan --- vllm/model_executor/model_loader/loader.py | 11 +++++++++++ vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/model_runner.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 6244241d18912..4f1092f68f50d 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -10,6 +10,7 @@ import inspect import itertools import math import os +import time import warnings from abc import ABC, abstractmethod from contextlib import contextmanager @@ -216,6 +217,9 @@ class DefaultModelLoader(BaseModelLoader): allow_patterns_overrides: Optional[list[str]] = None """If defined, weights will load exclusively using these patterns.""" + counter_before_loading_weights: float = 0.0 + counter_after_loading_weights: float = 0.0 + def __init__(self, load_config: LoadConfig): super().__init__(load_config) if load_config.model_loader_extra_config: @@ -368,6 +372,8 @@ class DefaultModelLoader(BaseModelLoader): weights_iterator = _xla_weights_iterator(weights_iterator) + if self.counter_before_loading_weights == 0.0: + self.counter_before_loading_weights = time.perf_counter() # Apply the prefix. return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator) @@ -412,6 +418,11 @@ class DefaultModelLoader(BaseModelLoader): weights_to_load = {name for name, _ in model.named_parameters()} loaded_weights = model.load_weights( self._get_all_weights(model_config, model)) + self.counter_after_loading_weights = time.perf_counter() + logger.info( + "Loading weights took %.2f seconds", + self.counter_after_loading_weights - + self.counter_before_loading_weights) # We only enable strict check for non-quantized models # that have loaded weights tracking currently. if model_config.quantization is None and loaded_weights is not None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0215b27353845..6785d66842696 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1061,7 +1061,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.device) time_after_load = time.perf_counter() self.model_memory_usage = m.consumed_memory - logger.info("Loading model weights took %.4f GB and %.6f seconds", + logger.info("Model loading took %.4f GB and %.6f seconds", self.model_memory_usage / float(2**30), time_after_load - time_before_load) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index bb2228165b528..0ea1d5dcbbb7f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1114,7 +1114,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): time_after_load = time.perf_counter() self.model_memory_usage = m.consumed_memory - logger.info("Loading model weights took %.4f GB and %.6f seconds", + logger.info("Model loading took %.4f GB and %.6f seconds", self.model_memory_usage / float(2**30), time_after_load - time_before_load)