diff --git a/vllm/config.py b/vllm/config.py index 1f7147f7cfd41..181fa803c620b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1294,6 +1294,12 @@ class LoadConfig: "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. "bitsandbytes" will load nf4 type weights. + "sharded_state" will load weights from pre-sharded checkpoint files, + supporting efficient loading of tensor-parallel models. + "gguf" will load weights from GGUF format files. + "mistral" will load weights from consolidated safetensors files used + by Mistral models. + "runai_streamer" will load weights from RunAI streamer format files. model_loader_extra_config: The extra config for the model loader. ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index edfa748b82d7b..e396e68f823d9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -339,9 +339,15 @@ class EngineArgs: 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' '* "runai_streamer" will load the Safetensors weights using Run:ai' - 'Model Streamer \n' + ' Model Streamer.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') + 'quantization.\n' + '* "sharded_state" will load weights from pre-sharded checkpoint ' + 'files, supporting efficient loading of tensor-parallel models.\n' + '* "gguf" will load weights from GGUF format files (details ' + 'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n' + '* "mistral" will load weights from consolidated safetensors files ' + 'used by Mistral models.\n') parser.add_argument( '--config-format', default=EngineArgs.config_format,