mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 16:57:05 +08:00
[V1] Remove legacy input registry (#15673)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
8693e47e6a
commit
355f66348c
@ -10,7 +10,6 @@ from transformers import PretrainedConfig
|
|||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.image import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ....conftest import _ImageAssets
|
from ....conftest import _ImageAssets
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
@ -156,11 +155,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": len(size_factors)},
|
limit_mm_per_prompt={"image": len(size_factors)},
|
||||||
)
|
)
|
||||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
)
|
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||||
|
|||||||
@ -4,7 +4,6 @@ import pytest
|
|||||||
from transformers import Idefics3Config
|
from transformers import Idefics3Config
|
||||||
|
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ....conftest import _ImageAssets
|
from ....conftest import _ImageAssets
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
@ -38,11 +37,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
)
|
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from transformers import PretrainedConfig
|
|||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.image import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ....conftest import _ImageAssets
|
from ....conftest import _ImageAssets
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
@ -113,11 +112,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": len(size_factors)},
|
limit_mm_per_prompt={"image": len(size_factors)},
|
||||||
)
|
)
|
||||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
)
|
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from pqdm.threads import pqdm
|
|||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.parse import ImageSize
|
from vllm.multimodal.parse import ImageSize
|
||||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
|
|
||||||
@ -40,10 +39,7 @@ def test_processor_max_tokens(model_id):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": 1},
|
limit_mm_per_prompt={"image": 1},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
|
||||||
)
|
|
||||||
info = processor.info
|
info = processor.info
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
@ -139,10 +135,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||||
(488, 183), (2560, 1669)]
|
(488, 183), (2560, 1669)]
|
||||||
@ -168,10 +161,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
image_sizes = list[ImageSize]()
|
image_sizes = list[ImageSize]()
|
||||||
|
|||||||
@ -10,7 +10,6 @@ from pqdm.threads import pqdm
|
|||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.parse import ImageSize
|
from vllm.multimodal.parse import ImageSize
|
||||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
|
|
||||||
@ -41,10 +40,7 @@ def test_processor_max_tokens(model_id):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": 1},
|
limit_mm_per_prompt={"image": 1},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
|
||||||
)
|
|
||||||
info = processor.info
|
info = processor.info
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
@ -139,10 +135,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||||
(488, 183), (2560, 1669)]
|
(488, 183), (2560, 1669)]
|
||||||
@ -169,10 +162,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
image_sizes = list[ImageSize]()
|
image_sizes = list[ImageSize]()
|
||||||
|
|||||||
@ -3,7 +3,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ....conftest import _ImageAssets
|
from ....conftest import _ImageAssets
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
@ -39,11 +38,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
)
|
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -3,7 +3,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
|
||||||
|
|
||||||
from ....conftest import _ImageAssets
|
from ....conftest import _ImageAssets
|
||||||
from ...utils import build_model_context
|
from ...utils import build_model_context
|
||||||
@ -34,11 +33,8 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
tokenizer = processor.info.get_tokenizer()
|
||||||
ctx.model_config,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
)
|
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
|
|||||||
replace_token_matches)
|
replace_token_matches)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.multimodal.profiling import MultiModalProfiler
|
from vllm.multimodal.profiling import MultiModalProfiler
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
cached_tokenizer_from_config)
|
|
||||||
from vllm.utils import full_groupby
|
from vllm.utils import full_groupby
|
||||||
|
|
||||||
from .utils import random_image
|
from .utils import random_image
|
||||||
@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
|||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
|
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(model_config),
|
|
||||||
)
|
|
||||||
profiler = MultiModalProfiler(processor)
|
profiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
|
mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
|
||||||
@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
|||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
|
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
rng = np.random.RandomState(0)
|
rng = np.random.RandomState(0)
|
||||||
image = random_image(rng, min_wh=128, max_wh=256)
|
image = random_image(rng, min_wh=128, max_wh=256)
|
||||||
@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
|||||||
revision=None,
|
revision=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
model_config,
|
|
||||||
tokenizer=cached_tokenizer_from_config(model_config),
|
|
||||||
)
|
|
||||||
orig_get_hf_processor = processor.info.get_hf_processor
|
orig_get_hf_processor = processor.info.get_hf_processor
|
||||||
|
|
||||||
def get_hf_processor(self, **kwargs):
|
def get_hf_processor(self, **kwargs):
|
||||||
|
|||||||
@ -261,13 +261,13 @@ class InputPreprocessor:
|
|||||||
# initialized without a tokenizer while using also multi-modal
|
# initialized without a tokenizer while using also multi-modal
|
||||||
# input.
|
# input.
|
||||||
if not self.tokenizer:
|
if not self.tokenizer:
|
||||||
tokenizer = None
|
tokenizer = object() # Dummy
|
||||||
else:
|
else:
|
||||||
tokenizer_group = self.get_tokenizer_group()
|
tokenizer_group = self.get_tokenizer_group()
|
||||||
tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
|
tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
|
||||||
|
|
||||||
mm_processor = self.mm_registry.create_processor(
|
mm_processor = self.mm_registry.create_processor(self.model_config,
|
||||||
self.model_config, tokenizer)
|
tokenizer=tokenizer)
|
||||||
|
|
||||||
if mm_processor_kwargs is None:
|
if mm_processor_kwargs is None:
|
||||||
mm_processor_kwargs = {}
|
mm_processor_kwargs = {}
|
||||||
@ -288,14 +288,14 @@ class InputPreprocessor:
|
|||||||
# initialized without a tokenizer while using also multi-modal
|
# initialized without a tokenizer while using also multi-modal
|
||||||
# input.
|
# input.
|
||||||
if not self.tokenizer:
|
if not self.tokenizer:
|
||||||
tokenizer = None
|
tokenizer = object() # Dummy
|
||||||
else:
|
else:
|
||||||
tokenizer_group = self.get_tokenizer_group()
|
tokenizer_group = self.get_tokenizer_group()
|
||||||
tokenizer = await tokenizer_group.get_lora_tokenizer_async(
|
tokenizer = await tokenizer_group.get_lora_tokenizer_async(
|
||||||
lora_request)
|
lora_request)
|
||||||
|
|
||||||
mm_processor = self.mm_registry.create_processor(
|
mm_processor = self.mm_registry.create_processor(self.model_config,
|
||||||
self.model_config, tokenizer)
|
tokenizer=tokenizer)
|
||||||
if mm_processor_kwargs is None:
|
if mm_processor_kwargs is None:
|
||||||
mm_processor_kwargs = {}
|
mm_processor_kwargs = {}
|
||||||
|
|
||||||
|
|||||||
@ -13,8 +13,7 @@ from typing_extensions import TypeVar, assert_never
|
|||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
cached_tokenizer_from_config)
|
|
||||||
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
|
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
|
||||||
resolve_mm_processor_kwargs)
|
resolve_mm_processor_kwargs)
|
||||||
|
|
||||||
@ -329,17 +328,27 @@ class InputRegistry:
|
|||||||
from vllm.model_executor.model_loader import get_model_architecture
|
from vllm.model_executor.model_loader import get_model_architecture
|
||||||
from vllm.multimodal import MultiModalKwargs
|
from vllm.multimodal import MultiModalKwargs
|
||||||
from vllm.multimodal.profiling import MultiModalProfiler
|
from vllm.multimodal.profiling import MultiModalProfiler
|
||||||
|
from vllm.sequence import SequenceData
|
||||||
|
|
||||||
if mm_registry.has_processor(model_config):
|
if mm_registry.has_processor(model_config):
|
||||||
tokenizer = cached_tokenizer_from_config(model_config)
|
|
||||||
processor = mm_registry.create_processor(model_config,
|
processor = mm_registry.create_processor(model_config,
|
||||||
tokenizer,
|
|
||||||
disable_cache=True)
|
disable_cache=True)
|
||||||
profiler = MultiModalProfiler(processor)
|
profiler = MultiModalProfiler(processor)
|
||||||
dummy_data_factory = (profiler.get_encoder_dummy_data
|
|
||||||
if is_encoder_data else
|
dummy_data_v1 = (profiler.get_encoder_dummy_data(seq_len)
|
||||||
profiler.get_decoder_dummy_data)
|
if is_encoder_data else
|
||||||
dummy_data = dummy_data_factory(seq_len)
|
profiler.get_decoder_dummy_data(seq_len))
|
||||||
|
_seq_data = SequenceData.from_seqs(
|
||||||
|
dummy_data_v1.prompt_token_ids) # type: ignore[attr-defined]
|
||||||
|
|
||||||
|
dummy_data = DummyData(
|
||||||
|
seq_data=_seq_data,
|
||||||
|
multi_modal_data=getattr(dummy_data_v1, "multi_modal_data",
|
||||||
|
None),
|
||||||
|
multi_modal_placeholders=getattr(dummy_data_v1,
|
||||||
|
"multi_modal_placeholders",
|
||||||
|
None),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
model_cls, _ = get_model_architecture(model_config)
|
model_cls, _ = get_model_architecture(model_config)
|
||||||
if is_encoder_data:
|
if is_encoder_data:
|
||||||
|
|||||||
@ -3,18 +3,18 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Generic, TypeVar, cast
|
from typing import Generic, NamedTuple, TypeVar, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.inputs import DummyData
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
|
from .inputs import (MultiModalDataDict, MultiModalEncDecInputs,
|
||||||
MultiModalInputs)
|
MultiModalInputs, MultiModalKwargs,
|
||||||
|
MultiModalPlaceholderDict)
|
||||||
from .processing import BaseMultiModalProcessor, BaseProcessingInfo
|
from .processing import BaseMultiModalProcessor, BaseProcessingInfo
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@ -31,6 +31,20 @@ class ProcessorInputs:
|
|||||||
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
|
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class DummyEncoderData(NamedTuple):
|
||||||
|
"""Dummy data used for profiling."""
|
||||||
|
|
||||||
|
prompt_token_ids: list[int]
|
||||||
|
|
||||||
|
|
||||||
|
class DummyDecoderData(NamedTuple):
|
||||||
|
"""Dummy data used for profiling."""
|
||||||
|
|
||||||
|
prompt_token_ids: list[int]
|
||||||
|
multi_modal_data: MultiModalKwargs
|
||||||
|
multi_modal_placeholders: MultiModalPlaceholderDict
|
||||||
|
|
||||||
|
|
||||||
_I = TypeVar("_I", bound=BaseProcessingInfo)
|
_I = TypeVar("_I", bound=BaseProcessingInfo)
|
||||||
|
|
||||||
|
|
||||||
@ -179,13 +193,7 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
"tokens.")
|
"tokens.")
|
||||||
return mm_inputs, total_placeholders_by_modality
|
return mm_inputs, total_placeholders_by_modality
|
||||||
|
|
||||||
def get_encoder_dummy_data(
|
def get_encoder_dummy_data(self, seq_len: int) -> DummyEncoderData:
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
) -> DummyData:
|
|
||||||
# Avoid circular import
|
|
||||||
from vllm.sequence import SequenceData
|
|
||||||
|
|
||||||
mm_inputs, _ = self.get_and_validate_mm_inputs(seq_len)
|
mm_inputs, _ = self.get_and_validate_mm_inputs(seq_len)
|
||||||
mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)
|
mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)
|
||||||
|
|
||||||
@ -197,19 +205,9 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
num_tokens_to_pad = max(total_len, seq_len) - total_len
|
num_tokens_to_pad = max(total_len, seq_len) - total_len
|
||||||
encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
|
encoder_prompt_token_ids.extend([0] * num_tokens_to_pad)
|
||||||
|
|
||||||
return DummyData(
|
return DummyEncoderData(encoder_prompt_token_ids)
|
||||||
seq_data=SequenceData.from_seqs(encoder_prompt_token_ids),
|
|
||||||
multi_modal_data=None,
|
|
||||||
multi_modal_placeholders=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_decoder_dummy_data(
|
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
) -> DummyData:
|
|
||||||
# Avoid circular import
|
|
||||||
from vllm.sequence import SequenceData
|
|
||||||
|
|
||||||
|
def get_decoder_dummy_data(self, seq_len: int) -> DummyDecoderData:
|
||||||
(mm_inputs, total_placeholders_by_modality
|
(mm_inputs, total_placeholders_by_modality
|
||||||
) = self.get_and_validate_mm_inputs(seq_len)
|
) = self.get_and_validate_mm_inputs(seq_len)
|
||||||
|
|
||||||
@ -231,16 +229,11 @@ class MultiModalProfiler(Generic[_I]):
|
|||||||
"and/or reduce `mm_counts`.", seq_len, total_len,
|
"and/or reduce `mm_counts`.", seq_len, total_len,
|
||||||
total_placeholders_by_modality)
|
total_placeholders_by_modality)
|
||||||
|
|
||||||
return DummyData(
|
if total_len < seq_len:
|
||||||
seq_data=SequenceData.from_prompt_token_counts((0, seq_len)),
|
prompt_token_ids.extend([0] * (seq_len - total_len))
|
||||||
multi_modal_data=None,
|
|
||||||
multi_modal_placeholders=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids)))
|
return DummyDecoderData(
|
||||||
|
prompt_token_ids=prompt_token_ids,
|
||||||
return DummyData(
|
|
||||||
seq_data=SequenceData.from_seqs(prompt_token_ids),
|
|
||||||
multi_modal_data=mm_inputs["mm_kwargs"],
|
multi_modal_data=mm_inputs["mm_kwargs"],
|
||||||
multi_modal_placeholders=mm_inputs["mm_placeholders"],
|
multi_modal_placeholders=mm_inputs["mm_placeholders"],
|
||||||
)
|
)
|
||||||
|
|||||||
@ -21,7 +21,8 @@ from .image import ImagePlugin
|
|||||||
from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
|
from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
|
||||||
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
|
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
|
||||||
ProcessingCache)
|
ProcessingCache)
|
||||||
from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
|
from .profiling import (BaseDummyInputsBuilder, DummyDecoderData,
|
||||||
|
DummyEncoderData, MultiModalProfiler)
|
||||||
from .video import VideoPlugin
|
from .video import VideoPlugin
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -256,10 +257,7 @@ class MultiModalRegistry:
|
|||||||
on underlying model configuration.
|
on underlying model configuration.
|
||||||
"""
|
"""
|
||||||
if self.has_processor(model_config):
|
if self.has_processor(model_config):
|
||||||
tokenizer = cached_tokenizer_from_config(model_config)
|
processor = self.create_processor(model_config, disable_cache=True)
|
||||||
processor = self.create_processor(model_config,
|
|
||||||
tokenizer,
|
|
||||||
disable_cache=True)
|
|
||||||
seq_len = model_config.max_model_len
|
seq_len = model_config.max_model_len
|
||||||
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
||||||
return processor.info.get_mm_max_tokens_per_item(
|
return processor.info.get_mm_max_tokens_per_item(
|
||||||
@ -373,10 +371,7 @@ class MultiModalRegistry:
|
|||||||
This should be called after :meth:`init_mm_limits_per_prompt`.
|
This should be called after :meth:`init_mm_limits_per_prompt`.
|
||||||
"""
|
"""
|
||||||
if self.has_processor(model_config):
|
if self.has_processor(model_config):
|
||||||
tokenizer = cached_tokenizer_from_config(model_config)
|
processor = self.create_processor(model_config, disable_cache=True)
|
||||||
processor = self.create_processor(model_config,
|
|
||||||
tokenizer,
|
|
||||||
disable_cache=True)
|
|
||||||
profiler = MultiModalProfiler(processor)
|
profiler = MultiModalProfiler(processor)
|
||||||
return profiler.get_mm_limits()
|
return profiler.get_mm_limits()
|
||||||
|
|
||||||
@ -436,8 +431,8 @@ class MultiModalRegistry:
|
|||||||
def create_processor(
|
def create_processor(
|
||||||
self,
|
self,
|
||||||
model_config: "ModelConfig",
|
model_config: "ModelConfig",
|
||||||
tokenizer: AnyTokenizer,
|
|
||||||
*,
|
*,
|
||||||
|
tokenizer: Optional[AnyTokenizer] = None,
|
||||||
disable_cache: Optional[bool] = None,
|
disable_cache: Optional[bool] = None,
|
||||||
) -> BaseMultiModalProcessor[BaseProcessingInfo]:
|
) -> BaseMultiModalProcessor[BaseProcessingInfo]:
|
||||||
"""
|
"""
|
||||||
@ -446,6 +441,8 @@ class MultiModalRegistry:
|
|||||||
See also:
|
See also:
|
||||||
:ref:`mm-processing`
|
:ref:`mm-processing`
|
||||||
"""
|
"""
|
||||||
|
if tokenizer is None:
|
||||||
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
if disable_cache is None:
|
if disable_cache is None:
|
||||||
disable_cache = model_config.disable_mm_preprocessor_cache
|
disable_cache = model_config.disable_mm_preprocessor_cache
|
||||||
|
|
||||||
@ -456,3 +453,49 @@ class MultiModalRegistry:
|
|||||||
cache = None if disable_cache else self._processing_cache
|
cache = None if disable_cache else self._processing_cache
|
||||||
|
|
||||||
return factories.build_processor(ctx, cache=cache)
|
return factories.build_processor(ctx, cache=cache)
|
||||||
|
|
||||||
|
def get_decoder_dummy_data(
|
||||||
|
self,
|
||||||
|
model_config: "ModelConfig",
|
||||||
|
seq_len: int,
|
||||||
|
) -> DummyDecoderData:
|
||||||
|
"""
|
||||||
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
|
The model is identified by ``model_config``.
|
||||||
|
"""
|
||||||
|
processor = self.create_processor(model_config, disable_cache=True)
|
||||||
|
profiler = MultiModalProfiler(processor)
|
||||||
|
dummy_data = profiler.get_decoder_dummy_data(seq_len)
|
||||||
|
|
||||||
|
# Having more tokens is over-conservative but otherwise fine
|
||||||
|
token_ids = dummy_data.prompt_token_ids
|
||||||
|
if len(token_ids) < seq_len:
|
||||||
|
raise AssertionError(
|
||||||
|
f"Expected at least {seq_len} dummy tokens for profiling, "
|
||||||
|
f"but found {len(token_ids)} tokens instead.")
|
||||||
|
|
||||||
|
return dummy_data
|
||||||
|
|
||||||
|
def get_encoder_dummy_data(
|
||||||
|
self,
|
||||||
|
model_config: "ModelConfig",
|
||||||
|
seq_len: int,
|
||||||
|
) -> DummyEncoderData:
|
||||||
|
"""
|
||||||
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
|
The model is identified by ``model_config``.
|
||||||
|
"""
|
||||||
|
processor = self.create_processor(model_config, disable_cache=True)
|
||||||
|
profiler = MultiModalProfiler(processor)
|
||||||
|
dummy_data = profiler.get_encoder_dummy_data(seq_len)
|
||||||
|
|
||||||
|
# Having more tokens is over-conservative but otherwise fine
|
||||||
|
token_ids = dummy_data.prompt_token_ids
|
||||||
|
if len(token_ids) < seq_len:
|
||||||
|
logger.warning_once(
|
||||||
|
f"Expected at least {seq_len} dummy encoder tokens for "
|
||||||
|
f"profiling, but found {len(token_ids)} tokens instead.")
|
||||||
|
|
||||||
|
return dummy_data
|
||||||
|
|||||||
@ -14,10 +14,11 @@ from vllm.config import ModelConfig, VllmConfig
|
|||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
|
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
|
||||||
from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
|
from vllm.inputs import PromptType
|
||||||
from vllm.inputs.preprocess import InputPreprocessor
|
from vllm.inputs.preprocess import InputPreprocessor
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||||
from vllm.outputs import RequestOutput
|
from vllm.outputs import RequestOutput
|
||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||||
@ -48,7 +49,7 @@ class AsyncLLM(EngineClient):
|
|||||||
executor_class: type[Executor],
|
executor_class: type[Executor],
|
||||||
log_stats: bool,
|
log_stats: bool,
|
||||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||||
input_registry: InputRegistry = INPUT_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
use_cached_outputs: bool = False,
|
use_cached_outputs: bool = False,
|
||||||
log_requests: bool = True,
|
log_requests: bool = True,
|
||||||
start_engine_loop: bool = True,
|
start_engine_loop: bool = True,
|
||||||
@ -90,7 +91,7 @@ class AsyncLLM(EngineClient):
|
|||||||
self.processor = Processor(
|
self.processor = Processor(
|
||||||
vllm_config=vllm_config,
|
vllm_config=vllm_config,
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
input_registry=input_registry,
|
mm_registry=mm_registry,
|
||||||
)
|
)
|
||||||
|
|
||||||
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
|
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
|
||||||
|
|||||||
@ -11,7 +11,7 @@ from vllm.config import ParallelConfig, VllmConfig
|
|||||||
from vllm.distributed import stateless_destroy_torch_distributed_process_group
|
from vllm.distributed import stateless_destroy_torch_distributed_process_group
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.engine.metrics_types import StatLoggerBase
|
from vllm.engine.metrics_types import StatLoggerBase
|
||||||
from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
|
from vllm.inputs import PromptType
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||||
@ -44,7 +44,6 @@ class LLMEngine:
|
|||||||
log_stats: bool,
|
log_stats: bool,
|
||||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||||
stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
|
stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
|
||||||
input_registry: InputRegistry = INPUT_REGISTRY,
|
|
||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
use_cached_outputs: bool = False,
|
use_cached_outputs: bool = False,
|
||||||
multiprocess_mode: bool = False,
|
multiprocess_mode: bool = False,
|
||||||
@ -80,7 +79,6 @@ class LLMEngine:
|
|||||||
# Processor (convert Inputs --> EngineCoreRequests)
|
# Processor (convert Inputs --> EngineCoreRequests)
|
||||||
self.processor = Processor(vllm_config=vllm_config,
|
self.processor = Processor(vllm_config=vllm_config,
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
input_registry=input_registry,
|
|
||||||
mm_registry=mm_registry)
|
mm_registry=mm_registry)
|
||||||
|
|
||||||
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
|
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
|
||||||
|
|||||||
@ -5,8 +5,7 @@ from collections.abc import Mapping
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
|
from vllm.inputs import ProcessorInputs, PromptType
|
||||||
PromptType, SingletonInputsAdapter)
|
|
||||||
from vllm.inputs.parse import split_enc_dec_inputs
|
from vllm.inputs.parse import split_enc_dec_inputs
|
||||||
from vllm.inputs.preprocess import InputPreprocessor
|
from vllm.inputs.preprocess import InputPreprocessor
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
@ -31,7 +30,6 @@ class Processor:
|
|||||||
self,
|
self,
|
||||||
vllm_config: VllmConfig,
|
vllm_config: VllmConfig,
|
||||||
tokenizer: BaseTokenizerGroup,
|
tokenizer: BaseTokenizerGroup,
|
||||||
input_registry: InputRegistry = INPUT_REGISTRY,
|
|
||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
):
|
):
|
||||||
|
|
||||||
@ -210,7 +208,6 @@ class Processor:
|
|||||||
self._validate_model_inputs(processed_inputs, lora_request)
|
self._validate_model_inputs(processed_inputs, lora_request)
|
||||||
|
|
||||||
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
|
||||||
decoder_inputs = SingletonInputsAdapter(decoder_inputs)
|
|
||||||
|
|
||||||
# TODO: Impl encoder-decoder
|
# TODO: Impl encoder-decoder
|
||||||
if encoder_inputs is not None:
|
if encoder_inputs is not None:
|
||||||
@ -221,8 +218,9 @@ class Processor:
|
|||||||
sampling_params = params.clone()
|
sampling_params = params.clone()
|
||||||
# If unset max tokens, then generate up to the max_model_len.
|
# If unset max tokens, then generate up to the max_model_len.
|
||||||
if sampling_params.max_tokens is None:
|
if sampling_params.max_tokens is None:
|
||||||
sampling_params.max_tokens = (self.model_config.max_model_len -
|
sampling_params.max_tokens = (
|
||||||
len(decoder_inputs.prompt_token_ids))
|
self.model_config.max_model_len -
|
||||||
|
len(decoder_inputs["prompt_token_ids"]))
|
||||||
sampling_params.update_from_generation_config(
|
sampling_params.update_from_generation_config(
|
||||||
self.generation_config_fields, eos_token_id)
|
self.generation_config_fields, eos_token_id)
|
||||||
sampling_params.update_from_tokenizer(
|
sampling_params.update_from_tokenizer(
|
||||||
@ -232,8 +230,8 @@ class Processor:
|
|||||||
sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None
|
sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None
|
||||||
sorted_mm_positions: Optional[list[PlaceholderRange]] = None
|
sorted_mm_positions: Optional[list[PlaceholderRange]] = None
|
||||||
sorted_mm_hashes: Optional[list[str]] = None
|
sorted_mm_hashes: Optional[list[str]] = None
|
||||||
if (decoder_mm_inputs := decoder_inputs.multi_modal_data):
|
if decoder_inputs["type"] == "multimodal":
|
||||||
assert isinstance(decoder_mm_inputs, MultiModalKwargs)
|
decoder_mm_inputs = decoder_inputs["mm_kwargs"]
|
||||||
|
|
||||||
# The output of merged multi-modal processor (`decoder_mm_inputs`)
|
# The output of merged multi-modal processor (`decoder_mm_inputs`)
|
||||||
# contains the kwargs for all items from all modalities.
|
# contains the kwargs for all items from all modalities.
|
||||||
@ -254,8 +252,8 @@ class Processor:
|
|||||||
sorted_mm_positions,
|
sorted_mm_positions,
|
||||||
sorted_mm_hashes,
|
sorted_mm_hashes,
|
||||||
) = merge_and_sort_multimodal_metadata(
|
) = merge_and_sort_multimodal_metadata(
|
||||||
decoder_inputs.multi_modal_placeholders,
|
decoder_inputs["mm_placeholders"],
|
||||||
decoder_inputs.multi_modal_hashes if self.use_hash else None,
|
decoder_inputs["mm_hashes"] if self.use_hash else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
|
# NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
|
||||||
@ -281,8 +279,8 @@ class Processor:
|
|||||||
|
|
||||||
return EngineCoreRequest(
|
return EngineCoreRequest(
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
prompt=decoder_inputs.prompt,
|
prompt=decoder_inputs.get("prompt"),
|
||||||
prompt_token_ids=decoder_inputs.prompt_token_ids,
|
prompt_token_ids=decoder_inputs["prompt_token_ids"],
|
||||||
mm_inputs=sorted_mm_inputs,
|
mm_inputs=sorted_mm_inputs,
|
||||||
mm_hashes=sorted_mm_hashes,
|
mm_hashes=sorted_mm_hashes,
|
||||||
mm_placeholders=sorted_mm_positions,
|
mm_placeholders=sorted_mm_positions,
|
||||||
|
|||||||
@ -15,7 +15,6 @@ from vllm.attention.layer import Attention
|
|||||||
from vllm.config import CompilationLevel, VllmConfig
|
from vllm.config import CompilationLevel, VllmConfig
|
||||||
from vllm.distributed.parallel_state import get_pp_group, graph_capture
|
from vllm.distributed.parallel_state import get_pp_group, graph_capture
|
||||||
from vllm.forward_context import set_forward_context
|
from vllm.forward_context import set_forward_context
|
||||||
from vllm.inputs import INPUT_REGISTRY
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||||
@ -130,7 +129,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
|
self.cascade_attn_enabled = not self.model_config.disable_cascade_attn
|
||||||
|
|
||||||
# Multi-modal data support
|
# Multi-modal data support
|
||||||
self.input_registry = INPUT_REGISTRY
|
|
||||||
self.mm_registry = MULTIMODAL_REGISTRY
|
self.mm_registry = MULTIMODAL_REGISTRY
|
||||||
self.uses_mrope = model_config.uses_mrope
|
self.uses_mrope = model_config.uses_mrope
|
||||||
|
|
||||||
@ -1473,16 +1471,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
encoder_budget, max_num_mm_items, dummy_data_modality)
|
encoder_budget, max_num_mm_items, dummy_data_modality)
|
||||||
|
|
||||||
# Create dummy batch of multimodal inputs.
|
# Create dummy batch of multimodal inputs.
|
||||||
dummy_request_data = self.input_registry.dummy_data_for_profiling(
|
dummy_request_data = self.mm_registry.get_decoder_dummy_data(
|
||||||
model_config=self.model_config,
|
model_config=self.model_config,
|
||||||
seq_len=self.max_num_tokens,
|
seq_len=self.max_num_tokens,
|
||||||
mm_registry=self.mm_registry,
|
|
||||||
)
|
)
|
||||||
dummy_mm_data = dummy_request_data.multi_modal_data
|
dummy_mm_data = dummy_request_data.multi_modal_data
|
||||||
if not isinstance(dummy_mm_data, MultiModalKwargs):
|
|
||||||
# TODO: Delete this check once input mapper is fully removed.
|
|
||||||
raise RuntimeError(
|
|
||||||
"Legacy input mapper is not supported in V1")
|
|
||||||
|
|
||||||
# Dummy data definition may contain multiple multimodal items
|
# Dummy data definition may contain multiple multimodal items
|
||||||
# (e.g, multiple images) for a single request, therefore here we
|
# (e.g, multiple images) for a single request, therefore here we
|
||||||
|
|||||||
@ -17,7 +17,6 @@ from vllm.attention.backends.abstract import AttentionType
|
|||||||
from vllm.attention.layer import Attention
|
from vllm.attention.layer import Attention
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.forward_context import set_forward_context
|
from vllm.forward_context import set_forward_context
|
||||||
from vllm.inputs import INPUT_REGISTRY
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.model_loader import get_model
|
from vllm.model_executor.model_loader import get_model
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
||||||
@ -102,7 +101,6 @@ class TPUModelRunner:
|
|||||||
self.hidden_size = model_config.get_hidden_size()
|
self.hidden_size = model_config.get_hidden_size()
|
||||||
|
|
||||||
# Multi-modal data support
|
# Multi-modal data support
|
||||||
self.input_registry = INPUT_REGISTRY
|
|
||||||
self.mm_registry = MULTIMODAL_REGISTRY
|
self.mm_registry = MULTIMODAL_REGISTRY
|
||||||
self.uses_mrope = model_config.uses_mrope
|
self.uses_mrope = model_config.uses_mrope
|
||||||
# TODO: Support M-RoPE (e.g, Qwen2-VL)
|
# TODO: Support M-RoPE (e.g, Qwen2-VL)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user