[Bugfix]: Clean up chunked prefill logging when using whisper (#25075)
Signed-off-by: simondanielsson <simon.danielsson99@hotmail.com>
parent 2e1b8bc2b6
commit e23cacda35
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import dataclasses
 from typing import Optional
 from unittest.mock import Mock
 
@@ -1899,4 +1900,53 @@ def test_priority_scheduling_preemption_when_out_of_kv():
     assert output.scheduled_cached_reqs.num_reqs == 1
     assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id
     assert len(scheduler.waiting) == 1
     assert len(scheduler.running) == 1
+
+
+@pytest.mark.parametrize(
+    ("enable_chunked_prefill", "is_encoder_decoder", "expect_enabled"),
+    [
+        (True, False, True),
+        (False, False, False),
+        # Encoder-decoder models should always have it disabled
+        (False, True, False),
+        (True, True, False),
+    ])
+def test_chunked_prefill_disabled_for_encoder_decoder(
+        enable_chunked_prefill: bool, is_encoder_decoder: bool,
+        expect_enabled: bool) -> None:
+    """Validate that chunked prefill is appropriately disabled for
+    encoder-decoder models."""
+    scheduler_config = SchedulerConfig(
+        enable_chunked_prefill=enable_chunked_prefill,
+        is_encoder_decoder=is_encoder_decoder,
+    )
+
+    # `is_encoder_decoder` should only be used during construction
+    # of the config, and otherwise stored in the model config.
+    assert "is_encoder_decoder" not in vars(scheduler_config)
+    assert "is_encoder_decoder" not in [
+        f.name for f in dataclasses.fields(scheduler_config)
+    ]
+    _validate_chunked_prefill_settings_for_encoder_decoder(
+        scheduler_config, is_encoder_decoder, expect_enabled)
+
+    # Ensure it is retained in VllmConfig, even after its post-init.
+    vllm_config = VllmConfig(scheduler_config=scheduler_config)
+    _validate_chunked_prefill_settings_for_encoder_decoder(
+        vllm_config.scheduler_config, is_encoder_decoder, expect_enabled)
+
+
+def _validate_chunked_prefill_settings_for_encoder_decoder(
+        scheduler_config: SchedulerConfig, is_encoder_decoder: bool,
+        expect_enabled: bool) -> None:
+    """Validate chunked prefill settings in the scheduler config for
+    encoder-decoder models."""
+    assert scheduler_config.chunked_prefill_enabled is expect_enabled
+    assert scheduler_config.enable_chunked_prefill is expect_enabled
+    if is_encoder_decoder:
+        # Encoder-decoder models should automatically disable chunked multimodal
+        # inputs as well
+        assert scheduler_config.disable_chunked_mm_input is not expect_enabled
+    if is_encoder_decoder and not expect_enabled:
+        assert scheduler_config.long_prefill_token_threshold == 0
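Note: the `vars()` and `dataclasses.fields()` assertions above rely on standard-library behavior: a `dataclasses.InitVar` is a pseudo-field that is accepted by `__init__` and forwarded to `__post_init__`, but is never stored on the instance nor reported as a field. A minimal standalone sketch of that behavior (generic names, not vLLM code):

    import dataclasses
    from dataclasses import InitVar, dataclass

    @dataclass
    class Config:
        enabled: bool = True
        # Pseudo-field: consumed at construction time only.
        is_special: InitVar[bool] = False

        def __post_init__(self, is_special: bool) -> None:
            if is_special:
                self.enabled = False

    cfg = Config(is_special=True)
    assert cfg.enabled is False
    assert "is_special" not in vars(cfg)  # never stored on the instance
    assert "is_special" not in [f.name for f in dataclasses.fields(cfg)]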
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from dataclasses import field
+from dataclasses import InitVar, field
 from typing import Any, Literal, Union
 
 from pydantic import SkipValidation, model_validator
@@ -84,6 +84,13 @@ class SchedulerConfig:
     is_multimodal_model: bool = False
     """True if the model is multimodal."""
 
+    is_encoder_decoder: InitVar[bool] = False
+    """True if the model is an encoder-decoder model.
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    disable chunked prefill and prefix caching for encoder-decoder models.
+    """
+
     # TODO (ywang96): Make this configurable.
     max_num_encoder_input_tokens: int = field(init=False)
     """Multimodal encoder compute budget, only used in V1.
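Note: this hunk pairs two complementary dataclass mechanisms. `InitVar[bool]` makes `is_encoder_decoder` constructor-only (accepted by `__init__`, never stored), while `field(init=False)` on `max_num_encoder_input_tokens` is the inverse: stored on the instance but excluded from `__init__` and assigned during `__post_init__`. A generic sketch of the contrast, with hypothetical names:

    from dataclasses import InitVar, dataclass, field

    @dataclass
    class Budget:
        max_tokens: int = 1024
        # Constructor-only: consumed by __post_init__, not stored.
        is_encoder_decoder: InitVar[bool] = False
        # Stored, but callers cannot pass it; derived in __post_init__.
        encoder_budget: int = field(init=False)

        def __post_init__(self, is_encoder_decoder: bool) -> None:
            self.encoder_budget = self.max_tokens if is_encoder_decoder else 0

    b = Budget(max_tokens=2048, is_encoder_decoder=True)
    assert b.encoder_budget == 2048
    assert not hasattr(b, "is_encoder_decoder")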
@@ -161,13 +168,23 @@ class SchedulerConfig:
                                usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self) -> None:
+    def __post_init__(self, is_encoder_decoder: bool) -> None:
         if self.max_model_len is None:
             self.max_model_len = 8192
 
         if self.max_num_seqs is None:
             self.max_num_seqs = 128
 
+        if is_encoder_decoder:
+            # Chunked prefill should be disabled for encoder-decoder models.
+            self.disable_chunked_mm_input = True
+            self.chunked_prefill_enabled = False
+            self.enable_chunked_prefill = False
+            self.long_prefill_token_threshold = 0
+            logger.info(
+                "Encoder-decoder models do not support chunked prefill nor"
+                " prefix caching; disabling both.")
+
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
                 self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
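With the disabling logic moved into `SchedulerConfig.__post_init__`, an encoder-decoder model now emits the informational message once at config construction time rather than through the duplicated `VllmConfig` path removed below. A usage sketch mirroring the new test (the `vllm.config` import path is assumed):

    from vllm.config import SchedulerConfig  # import path assumed

    config = SchedulerConfig(enable_chunked_prefill=True,
                             is_encoder_decoder=True)
    # Logs once: "Encoder-decoder models do not support chunked prefill nor
    # prefix caching; disabling both."
    assert config.chunked_prefill_enabled is False
    assert config.enable_chunked_prefill is False
    assert config.disable_chunked_mm_input is True
    assert config.long_prefill_token_threshold == 0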
@@ -386,10 +386,6 @@ class VllmConfig:
                 "Encoder-decoder model detected: setting "
                 "`max_num_encoder_input_tokens` to encoder length (%s)",
                 self.scheduler_config.max_num_encoder_input_tokens)
-            self.scheduler_config.disable_chunked_mm_input = True
-            disable_chunked_prefill_reasons.append(
-                "Encoder-decoder models do not support chunked prefill nor"
-                " prefix caching; disabling both.")
         if (self.model_config.architecture
                 == "WhisperForConditionalGeneration"
                 and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
@@ -400,7 +396,10 @@ class VllmConfig:
                 "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                 "to 'spawn'.")
 
-        if disable_chunked_prefill_reasons:
+        # Disable prefix caching only if chunked prefill is explicitly disabled
+        # (and not merely unset)
+        if (self.scheduler_config.chunked_prefill_enabled is False
+                or disable_chunked_prefill_reasons):
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
             self.scheduler_config.chunked_prefill_enabled = False
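Note: the new `is False` identity check distinguishes an explicit `False` (for example, forced off by the encoder-decoder path in `SchedulerConfig.__post_init__`) from a merely unset value; a plain truthiness test would conflate the two. A generic illustration of the distinction:

    def should_disable(flag, reasons):
        # Matches only an explicit False, never None (unset).
        return flag is False or bool(reasons)

    assert should_disable(False, []) is True            # explicitly disabled
    assert should_disable(None, []) is False            # unset: leave as-is
    assert should_disable(None, ["some reason"]) is True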
@@ -1367,6 +1367,7 @@ class EngineArgs:
             enable_chunked_prefill=self.enable_chunked_prefill,
             disable_chunked_mm_input=self.disable_chunked_mm_input,
             is_multimodal_model=model_config.is_multimodal_model,
+            is_encoder_decoder=model_config.is_encoder_decoder,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
             policy=self.scheduling_policy,