diff --git a/vllm/config.py b/vllm/config.py
index 9b92d9706c9dd..7a94179859527 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -41,6 +41,9 @@ class ModelConfig:
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id. If unspecified, will use the default
             version.
+        tokenizer_revision: The specific tokenizer version to use. It can be a
+            branch name, a tag name, or a commit id. If unspecified, will use
+            the default version.
         max_model_len: Maximum length of a sequence (including prompt and
             output). If None, will be derived from the model.
         quantization: Quantization method that was used to quantize the model
@@ -58,6 +61,7 @@ class ModelConfig:
         dtype: str,
         seed: int,
         revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
     ) -> None:
@@ -69,6 +73,7 @@ class ModelConfig:
         self.load_format = load_format
         self.seed = seed
         self.revision = revision
+        self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization

         self.hf_config = get_config(model, trust_remote_code, revision)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1e163a2bfb6ad..88f19bbc6da5b 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -29,6 +29,7 @@ class EngineArgs:
     max_num_seqs: int = 256
     disable_log_stats: bool = False
     revision: Optional[str] = None
+    tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None

     def __post_init__(self):
@@ -57,6 +58,13 @@ class EngineArgs:
             help='the specific model version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
             'the default version.')
+        parser.add_argument(
+            '--tokenizer-revision',
+            type=str,
+            default=None,
+            help='the specific tokenizer version to use. It can be a branch '
+            'name, a tag name, or a commit id. If unspecified, will use '
+            'the default version.')
         parser.add_argument('--tokenizer-mode',
                             type=str,
                             default=EngineArgs.tokenizer_mode,
@@ -175,7 +183,8 @@ class EngineArgs:
                                    self.tokenizer_mode, self.trust_remote_code,
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
-                                   self.max_model_len, self.quantization)
+                                   self.tokenizer_revision, self.max_model_len,
+                                   self.quantization)
         cache_config = CacheConfig(
             self.block_size, self.gpu_memory_utilization, self.swap_space,
             getattr(model_config.hf_config, 'sliding_window', None))
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c1874f13f07da..76cafd87684d7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -75,6 +75,7 @@ class LLMEngine:
             f"tokenizer={model_config.tokenizer!r}, "
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"revision={model_config.revision}, "
+            f"tokenizer_revision={model_config.tokenizer_revision}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
             f"max_seq_len={model_config.max_model_len}, "
@@ -98,6 +99,7 @@ class LLMEngine:
             model_config.tokenizer,
             tokenizer_mode=model_config.tokenizer_mode,
             trust_remote_code=model_config.trust_remote_code,
+            tokenizer_revision=model_config.tokenizer_revision,
             revision=model_config.revision)

         self.seq_counter = Counter()
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 6361364c75e30..9dddfc1acd9cc 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -42,6 +42,8 @@ class LLM:
             quantized and use `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id.
+        tokenizer_revision: The specific tokenizer version to use. It can be a
+            branch name, a tag name, or a commit id.
         seed: The seed to initialize the random number generator for sampling.
         gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
             reserve for the model weights, activations, and KV cache. Higher
@@ -65,6 +67,7 @@ class LLM:
         dtype: str = "auto",
         quantization: Optional[str] = None,
         revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
         swap_space: int = 4,
@@ -81,6 +84,7 @@ class LLM:
             dtype=dtype,
             quantization=quantization,
             revision=revision,
+            tokenizer_revision=tokenizer_revision,
             seed=seed,
             gpu_memory_utilization=gpu_memory_utilization,
             swap_space=swap_space,
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 74ce14c983813..6dafdac96f47f 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -16,6 +16,7 @@ def get_tokenizer(
     *args,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
+    tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
@@ -37,6 +38,7 @@ def get_tokenizer(
             tokenizer_name,
             *args,
             trust_remote_code=trust_remote_code,
+            tokenizer_revision=tokenizer_revision,
             **kwargs)
     except TypeError as e:
         # The LLaMA tokenizer causes a protobuf error in some environments.
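For reference, a minimal usage sketch of the new argument through the Python entrypoint; the model name and revision strings below are placeholders, not values taken from this change. The same setting is exposed on the command line as --tokenizer-revision.

from vllm import LLM, SamplingParams

# Pin the tokenizer to its own branch/tag/commit, independent of the weights
# revision. Both revision strings here are hypothetical placeholders.
llm = LLM(
    model="facebook/opt-125m",
    revision="main",
    tokenizer_revision="main",
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)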