From 6a84164adde79b225726949bbf47ca674f42ccdc Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 1 Mar 2025 14:10:28 +0800 Subject: [PATCH] [Bugfix] Add file lock for ModelScope download (#14060) Signed-off-by: Jee Jee Li --- benchmarks/backend_request_func.py | 15 ++++++++----- vllm/model_executor/model_loader/loader.py | 20 ++++++++++------- .../model_loader/weight_utils.py | 5 ++++- vllm/transformers_utils/tokenizer.py | 22 ++++++++++++------- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 364b087b841d..e43549c13c8e 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -14,6 +14,8 @@ from tqdm.asyncio import tqdm from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) +from vllm.model_executor.model_loader.weight_utils import get_lock + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -430,12 +432,15 @@ def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download - model_path = snapshot_download( - model_id=pretrained_model_name_or_path, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. 
+ with get_lock(pretrained_model_name_or_path): + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) - return model_path + return model_path return pretrained_model_name_or_path diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 46247eaf2a60..6244241d1891 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -49,7 +49,7 @@ from vllm.model_executor.model_loader.utils import (ParamMapping, from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, - get_gguf_extra_tensor_names, gguf_quant_weights_iterator, + get_gguf_extra_tensor_names, get_lock, gguf_quant_weights_iterator, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, runai_safetensors_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs @@ -235,13 +235,17 @@ class DefaultModelLoader(BaseModelLoader): from modelscope.hub.snapshot_download import snapshot_download if not os.path.exists(model): - model_path = snapshot_download( - model_id=model, - cache_dir=self.load_config.download_dir, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - revision=revision, - ignore_file_pattern=self.load_config.ignore_patterns, - ) + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model, self.load_config.download_dir): + model_path = snapshot_download( + model_id=model, + cache_dir=self.load_config.download_dir, + local_files_only=huggingface_hub.constants. 
+ HF_HUB_OFFLINE, + revision=revision, + ignore_file_pattern=self.load_config.ignore_patterns, + ) else: model_path = model return model_path diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 245c199f75b1..d184079fb25d 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -8,6 +8,7 @@ import os import tempfile import time from collections import defaultdict +from pathlib import Path from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import filelock @@ -67,8 +68,10 @@ class DisabledTqdm(tqdm): super().__init__(*args, **kwargs, disable=True) -def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): +def get_lock(model_name_or_path: Union[str, Path], + cache_dir: Optional[str] = None): lock_dir = cache_dir or temp_dir + model_name_or_path = str(model_name_or_path) os.makedirs(os.path.dirname(lock_dir), exist_ok=True) model_name = model_name_or_path.replace("/", "-") hash_name = hashlib.sha256(model_name.encode()).hexdigest() diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index f0aa5fdcaa61..2c34f2f5d44d 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -150,16 +150,22 @@ def get_tokenizer( # pylint: disable=C. from modelscope.hub.snapshot_download import snapshot_download + # avoid circular import + from vllm.model_executor.model_loader.weight_utils import get_lock + # Only set the tokenizer here, model will be downloaded on the workers. if not os.path.exists(tokenizer_name): - tokenizer_path = snapshot_download( - model_id=tokenizer_name, - cache_dir=download_dir, - revision=revision, - local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, - # Ignore weights - we only need the tokenizer.
- ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) - tokenizer_name = tokenizer_path + # Use file lock to prevent multiple processes from + # downloading the same file at the same time. + with get_lock(tokenizer_name, download_dir): + tokenizer_path = snapshot_download( + model_id=tokenizer_name, + cache_dir=download_dir, + revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + # Ignore weights - we only need the tokenizer. + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + tokenizer_name = tokenizer_path if tokenizer_mode == "slow": if kwargs.get("use_fast", False):