Fix handling of max_num_batched_tokens for pooling tasks (#23004)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
commit 52ce1420e9 (parent 829bbd7882)
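Pooling models run without chunked prefill, and VllmConfig previously reacted by also forcing scheduler_config.max_num_batched_tokens up to max(max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS), overriding the value that EngineArgs had already resolved. This change deletes that override and moves the EngineArgs cap (max_num_batched_tokens = max_model_len when chunked prefill is off) into the usage-context default selection, so it applies as a default rather than unconditionally.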
@@ -3600,9 +3600,6 @@ class VllmConfig:
             logger.info(reason)
             self.scheduler_config.chunked_prefill_enabled = False
             self.scheduler_config.long_prefill_token_threshold = 0
-            self.scheduler_config.max_num_batched_tokens = max(
-                self.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
             if self.cache_config is not None:
                 self.cache_config.enable_prefix_caching = False
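The effect of the removed lines can be seen in a minimal standalone sketch (not vLLM's API; the value of DEFAULT_MAX_NUM_BATCHED_TOKENS is assumed here purely for illustration):

DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048  # assumed value, for illustration only

def old_pooling_override(max_model_len: int) -> int:
    # Pre-commit behavior: when chunked prefill was disabled for a pooling
    # model, max_num_batched_tokens was forced up to at least the default,
    # discarding whatever EngineArgs had already resolved.
    return max(max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)

# A short-context pooling model where EngineArgs had resolved a small value:
print(old_pooling_override(max_model_len=384))  # 2048, not the resolved value

Post-commit, VllmConfig leaves the value resolved in EngineArgs untouched.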
@@ -1602,9 +1602,6 @@ class EngineArgs:
             self.enable_prefix_caching = incremental_prefill_supported
             logger.info("(%s) prefix caching by default", action)
 
-        if not self.enable_chunked_prefill:
-            self.max_num_batched_tokens = model_config.max_model_len
-
         # V1 should use the new scheduler by default.
         # Swap it only if this arg is set to the original V0 default
         if self.scheduler_cls == EngineArgs.scheduler_cls:
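The cap deleted here is not dropped: the next hunk reinstates it inside the usage-context default logic, so it runs as part of default selection instead of on every code path.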
@@ -1692,8 +1689,11 @@ class EngineArgs:
                 self.max_num_batched_tokens = \
                     default_max_num_batched_tokens[usage_context]
             else:
-                self.max_num_batched_tokens = default_max_num_batched_tokens[
-                    usage_context]
+                if not self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = model_config.max_model_len
+                else:
+                    self.max_num_batched_tokens = \
+                        default_max_num_batched_tokens[usage_context]
             logger.debug(
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, use_context_value)
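Read together, the new default order can be summarized as below. This is a simplified sketch: the surrounding guard for an explicitly set value is assumed from context, and the string/dict shapes stand in for vLLM's UsageContext enum and defaults table.

from typing import Optional

def resolve_max_num_batched_tokens(
    user_value: Optional[int],
    enable_chunked_prefill: bool,
    max_model_len: int,
    usage_context: str,
    defaults: dict[str, int],
) -> int:
    # Assumed from context: an explicit user setting is respected upstream.
    if user_value is not None:
        return user_value
    if not enable_chunked_prefill:
        # Without chunked prefill (the pooling case), an entire prompt must
        # fit into one batch, so the model's context length is the right cap.
        return max_model_len
    # Otherwise fall back to the per-usage-context default, as in the diff.
    return defaults[usage_context]

# Example: a pooling model with a 1k context and chunked prefill disabled.
print(resolve_max_num_batched_tokens(None, False, 1024, "LLM_CLASS",
                                     {"LLM_CLASS": 8192}))  # -> 1024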