[Tokenizer] Add tokenizer mode (#298)
commit 998d9d1509
parent 425040d4c1
@@ -17,6 +17,8 @@ class ModelConfig:
     Args:
         model: Name or path of the huggingface model to use.
         tokenizer: Name or path of the huggingface tokenizer to use.
+        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
+            available, and "slow" will always use the slow tokenizer.
         download_dir: Directory to download and load the weights, default to the
             default cache directory of huggingface.
         use_np_weights: Save a numpy copy of model weights for faster loading.
@@ -31,7 +33,8 @@ class ModelConfig:
     def __init__(
         self,
         model: str,
-        tokenizer: Optional[str],
+        tokenizer: str,
+        tokenizer_mode: str,
         download_dir: Optional[str],
         use_np_weights: bool,
         use_dummy_weights: bool,
@@ -40,6 +43,7 @@ class ModelConfig:
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
+        self.tokenizer_mode = tokenizer_mode
         self.download_dir = download_dir
         self.use_np_weights = use_np_weights
         self.use_dummy_weights = use_dummy_weights
@@ -47,6 +51,15 @@ class ModelConfig:

         self.hf_config: PretrainedConfig = AutoConfig.from_pretrained(model)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self._verify_tokenizer_mode()
+
+    def _verify_tokenizer_mode(self) -> None:
+        tokenizer_mode = self.tokenizer_mode.lower()
+        if tokenizer_mode not in ["auto", "slow"]:
+            raise ValueError(
+                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+                "either 'auto' or 'slow'.")
+        self.tokenizer_mode = tokenizer_mode

     def verify_with_parallel_config(
         self,
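The _verify_tokenizer_mode hook added above lower-cases the mode before checking it, so values like "AUTO" or "Slow" are accepted and normalized. A minimal standalone sketch of that validation, mirroring the method without importing vLLM (verify_tokenizer_mode is an illustrative name, not a vLLM API):

def verify_tokenizer_mode(tokenizer_mode: str) -> str:
    # Same normalization and check as ModelConfig._verify_tokenizer_mode.
    mode = tokenizer_mode.lower()
    if mode not in ["auto", "slow"]:
        raise ValueError(
            f"Unknown tokenizer mode: {tokenizer_mode}. Must be "
            "either 'auto' or 'slow'.")
    return mode

assert verify_tokenizer_mode("AUTO") == "auto"  # case is normalized
try:
    verify_tokenizer_mode("fast")
except ValueError as err:
    print(err)  # Unknown tokenizer mode: fast. Must be either 'auto' or 'slow'.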
@@ -12,6 +12,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
     tokenizer: Optional[str] = None
+    tokenizer_mode: str = "auto"
     download_dir: Optional[str] = None
     use_np_weights: bool = False
     use_dummy_weights: bool = False
@@ -42,6 +43,12 @@ class EngineArgs:
                             help='name or path of the huggingface model to use')
         parser.add_argument('--tokenizer', type=str, default=EngineArgs.tokenizer,
                             help='name or path of the huggingface tokenizer to use')
+        parser.add_argument('--tokenizer-mode', type=str,
+                            default=EngineArgs.tokenizer_mode,
+                            choices=['auto', 'slow'],
+                            help='tokenizer mode. "auto" will use the fast '
+                                 'tokenizer if available, and "slow" will '
+                                 'always use the slow tokenizer.')
         parser.add_argument('--download-dir', type=str,
                             default=EngineArgs.download_dir,
                             help='directory to download and load the weights, '
@@ -109,8 +116,8 @@ class EngineArgs:
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
         # Initialize the configs.
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.download_dir, self.use_np_weights,
-            self.use_dummy_weights, self.dtype, self.seed)
+            self.model, self.tokenizer, self.tokenizer_mode, self.download_dir,
+            self.use_np_weights, self.use_dummy_weights, self.dtype, self.seed)
         cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
                                    self.swap_space)
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
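Because the new --tokenizer-mode flag declares choices=['auto', 'slow'], argparse rejects bad values before ModelConfig's own check ever runs. A self-contained sketch of just this flag's behavior, using plain argparse rather than the real EngineArgs parser:

import argparse

# Reproduces only the --tokenizer-mode flag from the hunk above.
parser = argparse.ArgumentParser()
parser.add_argument('--tokenizer-mode', type=str, default='auto',
                    choices=['auto', 'slow'],
                    help='tokenizer mode. "auto" will use the fast '
                         'tokenizer if available, and "slow" will '
                         'always use the slow tokenizer.')

print(parser.parse_args([]).tokenizer_mode)                            # auto
print(parser.parse_args(['--tokenizer-mode', 'slow']).tokenizer_mode)  # slow
# parser.parse_args(['--tokenizer-mode', 'fast']) exits with
# "argument --tokenizer-mode: invalid choice: 'fast'".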
@@ -61,6 +61,7 @@ class LLMEngine:
             "Initializing an LLM engine with config: "
             f"model={model_config.model!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"dtype={model_config.dtype}, "
             f"use_dummy_weights={model_config.use_dummy_weights}, "
             f"download_dir={model_config.download_dir!r}, "
@@ -77,7 +78,8 @@ class LLMEngine:
         self.log_stats = log_stats
         self._verify_args()

-        self.tokenizer = get_tokenizer(model_config.tokenizer)
+        self.tokenizer = get_tokenizer(model_config.tokenizer,
+                                       model_config.tokenizer_mode)
         self.seq_counter = Counter()

         # Create the parallel GPU workers.
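Taken together with the EngineArgs hunks, the mode now flows EngineArgs -> ModelConfig -> get_tokenizer. A quick way to watch the propagation (the vllm.engine.arg_utils import path is an assumption, since this diff omits file names; fetching the model config requires network access):

from vllm.engine.arg_utils import EngineArgs  # assumed module path

engine_args = EngineArgs(model="facebook/opt-125m", tokenizer_mode="slow")
model_config, _, _, _ = engine_args.create_engine_configs()
print(model_config.tokenizer_mode)  # "slow", already validated and lower-cased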
@@ -26,6 +26,8 @@ class LLM:
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
+        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+            if available, and "slow" will always use the slow tokenizer.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
@@ -40,6 +42,7 @@ class LLM:
         self,
         model: str,
         tokenizer: Optional[str] = None,
+        tokenizer_mode: str = "auto",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         seed: int = 0,
@@ -50,6 +53,7 @@ class LLM:
         engine_args = EngineArgs(
             model=model,
             tokenizer=tokenizer,
+            tokenizer_mode=tokenizer_mode,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             seed=seed,
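With the parameter plumbed through, the offline API can request the slow tokenizer in one argument. A usage sketch (facebook/opt-125m is just a small stand-in model; downloading it requires network access):

from vllm import LLM, SamplingParams

# tokenizer_mode="slow" loads the slow (pure-Python) HF tokenizer even
# when a fast Rust-backed tokenizer exists for the model.
llm = LLM(model="facebook/opt-125m", tokenizer_mode="slow")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=16))
print(outputs[0].outputs[0].text)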
@@ -313,7 +313,7 @@ if __name__ == "__main__":
     engine = AsyncLLMEngine.from_engine_args(engine_args)

     # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(args.model)
+    tokenizer = get_tokenizer(engine_args.tokenizer, engine_args.tokenizer_mode)

     uvicorn.run(app, host=args.host, port=args.port, log_level="info",
                 timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
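The one-line change above is also a small bug fix: the server previously built its detokenization tokenizer from args.model even when --tokenizer pointed elsewhere; it now honors both --tokenizer and --tokenizer-mode. A hypothetical client-side smoke test against the demo server's /generate route (the route name and JSON fields are assumptions, since they are not part of this diff):

import requests  # third-party: pip install requests

# Assumes the API server was started with, e.g., --tokenizer-mode slow
# and is listening on the default host and port.
response = requests.post("http://localhost:8000/generate",
                         json={"prompt": "Hello, my name is",
                               "max_tokens": 16})
print(response.json())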
@@ -13,10 +13,17 @@ _FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"

 def get_tokenizer(
     tokenizer_name: str,
+    tokenizer_mode: str = "auto",
     *args,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
     if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
         logger.info(
             "For some LLaMA-based models, initializing the fast tokenizer may "
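In "slow" mode the function pins use_fast=False before delegating to HF, and rejects an explicit use_fast=True as contradictory. The effect, sketched with transformers directly since get_tokenizer's module path is not shown in this diff (downloading gpt2 requires network access):

from transformers import AutoTokenizer, PreTrainedTokenizerFast

fast = AutoTokenizer.from_pretrained("gpt2")                  # "auto" mode
slow = AutoTokenizer.from_pretrained("gpt2", use_fast=False)  # "slow" mode
print(isinstance(fast, PreTrainedTokenizerFast))  # True
print(isinstance(slow, PreTrainedTokenizerFast))  # False

# With the code above, get_tokenizer("gpt2", tokenizer_mode="slow",
# use_fast=True) raises:
# ValueError: Cannot use the fast tokenizer in slow tokenizer mode.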