add support for tokenizer revision (#1163)

Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Federico Cassano, 2023-10-02 22:19:46 -04:00, committed by GitHub
parent ba0bfd40e2
commit 66d18a7fb0
5 changed files with 23 additions and 1 deletion


@@ -41,6 +41,9 @@ class ModelConfig:
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id. If unspecified, will use the default
             version.
+        tokenizer_revision: The specific tokenizer version to use. It can be a
+            branch name, a tag name, or a commit id. If unspecified, will use
+            the default version.
         max_model_len: Maximum length of a sequence (including prompt and
             output). If None, will be derived from the model.
         quantization: Quantization method that was used to quantize the model
@@ -58,6 +61,7 @@
         dtype: str,
         seed: int,
         revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
     ) -> None:
@@ -69,6 +73,7 @@
         self.load_format = load_format
         self.seed = seed
         self.revision = revision
+        self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
 
         self.hf_config = get_config(model, trust_remote_code, revision)
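
For orientation, a minimal sketch of the widened constructor as it might be called directly. In practice ModelConfig is built by EngineArgs (next file); every value below is a placeholder for illustration, not part of this commit:

    from vllm.config import ModelConfig

    config = ModelConfig(
        model="facebook/opt-125m",       # placeholder model
        tokenizer="facebook/opt-125m",
        tokenizer_mode="auto",
        trust_remote_code=False,
        download_dir=None,
        load_format="auto",
        dtype="auto",
        seed=0,
        revision=None,                   # model weights: default revision
        tokenizer_revision="main",       # new: pin the tokenizer independently
    )

Note that the tokenizer now has its own revision while get_config() continues to resolve the model config with the existing `revision`.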


@@ -29,6 +29,7 @@ class EngineArgs:
     max_num_seqs: int = 256
     disable_log_stats: bool = False
     revision: Optional[str] = None
+    tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
 
     def __post_init__(self):
@@ -57,6 +58,13 @@ class EngineArgs:
             help='the specific model version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
             'the default version.')
+        parser.add_argument(
+            '--tokenizer-revision',
+            type=str,
+            default=None,
+            help='the specific tokenizer version to use. It can be a branch '
+            'name, a tag name, or a commit id. If unspecified, will use '
+            'the default version.')
         parser.add_argument('--tokenizer-mode',
                             type=str,
                             default=EngineArgs.tokenizer_mode,
@@ -175,7 +183,8 @@ class EngineArgs:
                                    self.tokenizer_mode, self.trust_remote_code,
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
-                                   self.max_model_len, self.quantization)
+                                   self.tokenizer_revision, self.max_model_len,
+                                   self.quantization)
         cache_config = CacheConfig(
             self.block_size, self.gpu_memory_utilization, self.swap_space,
             getattr(model_config.hf_config, 'sliding_window', None))
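
A sketch of the threading above, assuming the last hunk sits in vLLM's EngineArgs.create_engine_configs() (the method is not named in this diff); values are placeholders:

    from vllm.engine.arg_utils import EngineArgs

    # Normally populated from CLI flags, including the new --tokenizer-revision.
    args = EngineArgs(model="facebook/opt-125m", tokenizer_revision="main")
    model_config = args.create_engine_configs()[0]  # ModelConfig comes first
    assert model_config.tokenizer_revision == "main"

On the command line the same knob is exposed as --tokenizer-revision, mirroring the existing --revision flag.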


@@ -75,6 +75,7 @@ class LLMEngine:
             f"tokenizer={model_config.tokenizer!r}, "
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"revision={model_config.revision}, "
+            f"tokenizer_revision={model_config.tokenizer_revision}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
             f"max_seq_len={model_config.max_model_len}, "
@@ -98,6 +99,7 @@ class LLMEngine:
             model_config.tokenizer,
             tokenizer_mode=model_config.tokenizer_mode,
             trust_remote_code=model_config.trust_remote_code,
+            tokenizer_revision=model_config.tokenizer_revision,
             revision=model_config.revision)
         self.seq_counter = Counter()
 


@@ -42,6 +42,8 @@ class LLM:
             quantized and use `dtype` to determine the data type of the weights.
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id.
+        tokenizer_revision: The specific tokenizer version to use. It can be a
+            branch name, a tag name, or a commit id.
         seed: The seed to initialize the random number generator for sampling.
         gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
             reserve for the model weights, activations, and KV cache. Higher
@@ -65,6 +67,7 @@ class LLM:
         dtype: str = "auto",
         quantization: Optional[str] = None,
         revision: Optional[str] = None,
+        tokenizer_revision: Optional[str] = None,
         seed: int = 0,
         gpu_memory_utilization: float = 0.9,
         swap_space: int = 4,
@@ -81,6 +84,7 @@ class LLM:
             dtype=dtype,
             quantization=quantization,
             revision=revision,
+            tokenizer_revision=tokenizer_revision,
             seed=seed,
             gpu_memory_utilization=gpu_memory_utilization,
             swap_space=swap_space,
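
End to end, the public entry point now accepts the new keyword. A minimal usage sketch (model name and revision values are placeholders):

    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",   # placeholder model
        revision=None,               # weights: default revision
        tokenizer_revision="main",   # tokenizer pinned to a branch/tag/commit
    )
    outputs = llm.generate("Hello, my name is")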


@@ -16,6 +16,7 @@ def get_tokenizer(
     *args,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
+    tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
@@ -37,6 +38,7 @@ def get_tokenizer(
             tokenizer_name,
             *args,
             trust_remote_code=trust_remote_code,
+            tokenizer_revision=tokenizer_revision,
             **kwargs)
     except TypeError as e:
         # The LLaMA tokenizer causes a protobuf error in some environments.
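
At the lowest level, the helper forwards the new kwarg to the underlying Hugging Face from_pretrained call (the callee is not visible in this hunk). A sketch of calling it directly, with placeholder model name and revision:

    from vllm.transformers_utils.tokenizer import get_tokenizer

    tokenizer = get_tokenizer(
        "facebook/opt-125m",        # placeholder model name
        tokenizer_mode="auto",
        tokenizer_revision="main",  # forwarded by the patched helper above
    )
    print(type(tokenizer).__name__)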