diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 52c2363c89874..c5ae35985c17c 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -11,7 +11,7 @@ python save_sharded_state.py \
     --model /path/to/load \
     --quantization deepspeedfp \
     --tensor-parallel-size 8 \
-    --output /path/to/save/sharded/modele
+    --output /path/to/save/sharded/model
 
 python load_sharded_state.py \
     --model /path/to/saved/sharded/model \
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c225479810aa4..3709710ef42e7 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3035,7 +3035,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         time_after_load = time.perf_counter()
         self.model_memory_usage = m.consumed_memory
         logger.info_once(
-            "Model loading took %.4f GiB and %.6f seconds",
+            "Model loading took %.4f GiB memory and %.6f seconds",
             self.model_memory_usage / GiB_bytes,
             time_after_load - time_before_load,
             scope="local",