From 68d535ef442384478797aa64738ec8e96b6b43d3 Mon Sep 17 00:00:00 2001
From: Jun Duan
Date: Sat, 22 Feb 2025 01:06:34 -0500
Subject: [PATCH] [Misc] Capture and log the time of loading weights (#13666)

---
 vllm/v1/worker/gpu_model_runner.py | 8 +++++---
 vllm/worker/model_runner.py        | 7 +++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 31fe095a91bc0..d2e9c2650c7b5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1048,6 +1048,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
+            time_before_load = time.perf_counter()
             self.model = get_model(vllm_config=self.vllm_config)
             if self.lora_config:
                 self.model = self.load_lora_model(self.model,
@@ -1055,10 +1056,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                                   self.scheduler_config,
                                                   self.lora_config,
                                                   self.device)
-
+            time_after_load = time.perf_counter()
         self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
-                    self.model_memory_usage / float(2**30))
+        logger.info("Loading model weights took %.4f GB and %.6f seconds",
+                    self.model_memory_usage / float(2**30),
+                    time_after_load - time_before_load)
 
     def _get_prompt_logprobs_dict(
         self,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 67d175c373d82..1a78498ad1240 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1109,11 +1109,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
     def load_model(self) -> None:
         logger.info("Starting to load model %s...", self.model_config.model)
         with DeviceMemoryProfiler(self.device) as m:
+            time_before_load = time.perf_counter()
             self.model = get_model(vllm_config=self.vllm_config)
+            time_after_load = time.perf_counter()
 
         self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
-                    self.model_memory_usage / float(2**30))
+        logger.info("Loading model weights took %.4f GB and %.6f seconds",
+                    self.model_memory_usage / float(2**30),
+                    time_after_load - time_before_load)
 
         if self.lora_config:
             assert supports_lora(
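
Note for reviewers: the pattern applied in both runners is the same. Sample time.perf_counter() immediately before and after the weight-loading call and report the delta alongside the memory figure already measured by DeviceMemoryProfiler. Below is a minimal, self-contained sketch of that pattern for illustration only; load_weights, the logger name, and the 4 GiB figure are hypothetical stand-ins, not vLLM code.

    import logging
    import time

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("weight_loading_demo")

    def load_weights() -> int:
        """Hypothetical stand-in for the real weight-loading call."""
        time.sleep(0.1)       # simulate deserialization / I/O work
        return 4 * 2**30      # pretend 4 GiB of weights were materialized

    def load_model() -> None:
        # Same idea as the patch: wall-clock timestamps around the load call.
        time_before_load = time.perf_counter()
        model_memory_usage = load_weights()
        time_after_load = time.perf_counter()
        logger.info("Loading model weights took %.4f GB and %.6f seconds",
                    model_memory_usage / float(2**30),
                    time_after_load - time_before_load)

    if __name__ == "__main__":
        load_model()

perf_counter() is used rather than time.time() because it is a monotonic clock intended for measuring intervals, so the reported duration is not affected by system clock adjustments.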