diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 93ac18dfcc7b4..f947b6438c7cc 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -337,6 +337,10 @@ def test_human_readable_model_len():
     args = parser.parse_args(["--max-model-len", "10.212345k"])
     assert args.max_model_len == 10212
 
+    # Auto via -1
+    args = parser.parse_args(["--max-model-len", "-1"])
+    assert args.max_model_len == -1
+
     # Invalid (do not allow decimals with binary multipliers)
     for invalid in ["1a", "pwd", "10.24", "1.23M"]:
         with pytest.raises(ArgumentError):
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index cfc5e07d83299..de984c1306ecd 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -345,7 +345,13 @@ class ModelConfig:
     format. Examples:\n
     - 1k -> 1000\n
     - 1K -> 1024\n
-    - 25.6k -> 25,600"""
+    - 25.6k -> 25,600\n
+
+    Pass ``-1`` to automatically choose the largest length that fits
+    in available GPU memory."""
+    auto_max_model_len: bool = False
+    """Automatically determine the maximum model length that fits in GPU
+    memory. Enabled when ``--max-model-len`` is ``-1``."""
     spec_target_max_model_len: Optional[int] = None
     """Specify the maximum length for spec decoding draft models."""
     quantization: SkipValidation[Optional[QuantizationMethods]] = None
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 06bd97dd6abe9..d1f8d48e7a113 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -227,7 +227,9 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         elif contains_type(type_hints, int):
             kwargs[name]["type"] = int
             # Special case for large integers
-            if name in {"max_model_len", "max_num_batched_tokens"}:
+            if name == "max_model_len":
+                kwargs[name]["type"] = human_readable_int
+            elif name == "max_num_batched_tokens":
                 kwargs[name]["type"] = human_readable_int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
@@ -945,6 +947,12 @@ class EngineArgs:
 
             self.mm_encoder_tp_mode = "data"
 
+        max_model_len = self.max_model_len
+        auto_max_model_len = False
+        if max_model_len is not None and max_model_len < 0:
+            auto_max_model_len = True
+            max_model_len = None
+
         return ModelConfig(
             model=self.model,
             hf_config_path=self.hf_config_path,
@@ -964,7 +972,8 @@ class EngineArgs:
             hf_token=self.hf_token,
             hf_overrides=self.hf_overrides,
             tokenizer_revision=self.tokenizer_revision,
-            max_model_len=self.max_model_len,
+            max_model_len=max_model_len,
+            auto_max_model_len=auto_max_model_len,
             quantization=self.quantization,
             enforce_eager=self.enforce_eager,
             max_seq_len_to_capture=self.max_seq_len_to_capture,
@@ -1847,3 +1856,4 @@ def human_readable_int(value):
 
     # Regular plain number.
     return int(value)
+
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 6a62c55fb2d5f..9f714d7de4740 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -655,6 +655,9 @@ def estimate_max_model_len(vllm_config: VllmConfig,
             left = mid + 1
         else:
             right = mid - 1
+
+    # Restore the original max_model_len before returning.
+    vllm_config.model_config.max_model_len = current_max
     return result
 
 
@@ -690,6 +693,17 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
     # Estimate the maximum model length that can fit in the available memory
     estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
                                                available_memory)
+    if vllm_config.model_config.auto_max_model_len:
+        if estimated_max_len <= 0:
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
+                             "initializing the engine.")
+        logger.info(
+            "Setting max_model_len to %s based on available memory.",
+            estimated_max_len)
+        vllm_config.recalculate_max_model_len(estimated_max_len)
+        return
+
     estimated_msg = ""
     if estimated_max_len > 0:
         estimated_msg = (