diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 72fec5e205e3..14fc5589a89a 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3600,9 +3600,6 @@ class VllmConfig:
                 logger.info(reason)
             self.scheduler_config.chunked_prefill_enabled = False
             self.scheduler_config.long_prefill_token_threshold = 0
-            self.scheduler_config.max_num_batched_tokens = max(
-                self.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
             if self.cache_config is not None:
                 self.cache_config.enable_prefix_caching = False
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f8af6d36e0c0..630fbec4539e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1602,9 +1602,6 @@ class EngineArgs:
                 self.enable_prefix_caching = incremental_prefill_supported
             logger.info("(%s) prefix caching by default", action)
 
-        if not self.enable_chunked_prefill:
-            self.max_num_batched_tokens = model_config.max_model_len
-
         # V1 should use the new scheduler by default.
         # Swap it only if this arg is set to the original V0 default
         if self.scheduler_cls == EngineArgs.scheduler_cls:
@@ -1692,8 +1689,11 @@ class EngineArgs:
                 self.max_num_batched_tokens = \
                     default_max_num_batched_tokens[usage_context]
             else:
-                self.max_num_batched_tokens = default_max_num_batched_tokens[
-                    usage_context]
+                if not self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = model_config.max_model_len
+                else:
+                    self.max_num_batched_tokens = \
+                        default_max_num_batched_tokens[usage_context]
             logger.debug(
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, use_context_value)
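
For reference, here is a minimal standalone sketch of the defaulting behavior this diff consolidates, assuming the surrounding block in arg_utils.py only runs when the user has not set max_num_batched_tokens. The function name resolve_max_num_batched_tokens and the numeric defaults are illustrative placeholders, not the actual vLLM API; the point is that the max_model_len fallback for disabled chunked prefill now lives inside the defaulting path rather than unconditionally overwriting the value.

```python
# Illustrative sketch only: mirrors the defaulting logic shown in the diff,
# using hypothetical names and placeholder numbers rather than the real
# vLLM EngineArgs / UsageContext types.
from enum import Enum
from typing import Optional


class UsageContext(Enum):
    LLM_CLASS = "LLM_CLASS"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"


# Placeholder values; the real defaults live in vllm/engine/arg_utils.py.
DEFAULT_MAX_NUM_BATCHED_TOKENS = {
    UsageContext.LLM_CLASS: 8192,
    UsageContext.OPENAI_API_SERVER: 2048,
}


def resolve_max_num_batched_tokens(
    max_num_batched_tokens: Optional[int],
    enable_chunked_prefill: bool,
    max_model_len: int,
    usage_context: UsageContext,
) -> int:
    """Pick the effective max_num_batched_tokens.

    A user-supplied value always wins. Otherwise, with chunked prefill
    disabled the batch must fit a whole prompt, so fall back to
    max_model_len; with chunked prefill enabled, use the per-usage-context
    default.
    """
    if max_num_batched_tokens is not None:
        return max_num_batched_tokens
    if not enable_chunked_prefill:
        return max_model_len
    return DEFAULT_MAX_NUM_BATCHED_TOKENS[usage_context]


if __name__ == "__main__":
    # Chunked prefill off: default follows the model's context length.
    print(resolve_max_num_batched_tokens(None, False, 32768,
                                         UsageContext.LLM_CLASS))  # 32768
    # Chunked prefill on: default follows the usage context.
    print(resolve_max_num_batched_tokens(None, True, 32768,
                                         UsageContext.LLM_CLASS))  # 8192
```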