diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f0404e0bc6eac..93692a0c9772e 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -132,6 +132,14 @@ class LLM: hf_overrides: If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the HuggingFace config. + mm_processor_kwargs: Arguments to be forwarded to the model's processor + for multi-modal data, e.g., image processor. Overrides for the + multi-modal processor obtained from `AutoProcessor.from_pretrained`. + The available overrides depend on the model that is being run. + For example, for Phi-3-Vision: `{"num_crops": 4}`. + override_pooler_config: Initialize non-default pooling config or + override default pooling config for the pooling model. + e.g. `PoolerConfig(pooling_type="mean", normalize=False)`. compilation_config: Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. @@ -1347,16 +1355,16 @@ class LLM: during the sleep period, before `wake_up` is called. Args: - level: The sleep level. Level 1 sleep will offload the model - weights and discard the kv cache. The content of kv cache + level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. The content of kv cache is forgotten. Level 1 sleep is good for sleeping and waking - up the engine to run the same model again. The model weights - are backed up in CPU memory. Please make sure there's enough - CPU memory to store the model weights. Level 2 sleep will - discard both the model weights and the kv cache. The content - of both the model weights and kv cache is forgotten. Level 2 + up the engine to run the same model again. The model weights + are backed up in CPU memory. Please make sure there's enough + CPU memory to store the model weights. Level 2 sleep will + discard both the model weights and the kv cache. The content + of both the model weights and kv cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a - different model or update the model, where previous model + different model or update the model, where previous model weights are not needed. It reduces CPU memory pressure. """ self.reset_prefix_cache() @@ -1366,12 +1374,12 @@ class LLM: """ Wake up the engine from sleep mode. See the [sleep][] method for more details. - + Args: - tags: An optional list of tags to reallocate the engine memory - for specific memory allocations. Values must be in + tags: An optional list of tags to reallocate the engine memory + for specific memory allocations. Values must be in `("weights", "kv_cache")`. If None, all memory is reallocated. - wake_up should be called with all tags (or None) before the + wake_up should be called with all tags (or None) before the engine is used again. """ self.llm_engine.wake_up(tags)