mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-25 09:07:04 +08:00
reformat commits
This commit is contained in:
parent
1bceb28678
commit
4e558858b8
@ -7,16 +7,16 @@
|
|||||||
# LICENSE is in root directory.
|
# LICENSE is in root directory.
|
||||||
# --------------------------------------------------------
|
# --------------------------------------------------------
|
||||||
|
|
||||||
|
import copy
|
||||||
import math
|
import math
|
||||||
import random
|
import random
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
import copy
|
|
||||||
import warnings
|
import warnings
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from collections.abc import Iterable, Mapping, Sequence, Callable
|
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||||
|
from dataclasses import dataclass
|
||||||
from typing import Annotated, Any, Literal, TypeAlias, TypeVar
|
from typing import Annotated, Any, Literal, TypeAlias, TypeVar
|
||||||
|
|
||||||
|
import einops
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
import regex as re
|
import regex as re
|
||||||
import torch
|
import torch
|
||||||
@ -24,7 +24,6 @@ import torch.nn as nn
|
|||||||
import torchvision.transforms as T
|
import torchvision.transforms as T
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||||
import einops
|
|
||||||
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
|
||||||
@ -181,7 +180,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
thumbnail_area = self._thumbnail_size * self._thumbnail_size
|
thumbnail_area = self._thumbnail_size * self._thumbnail_size
|
||||||
area_ratio = resized_area / thumbnail_area
|
area_ratio = resized_area / thumbnail_area
|
||||||
|
|
||||||
# Only add thumbnail if resized image area is less than threshold % of thumbnail area
|
# Only add thumbnail if resized image area is less than threshold % of
|
||||||
|
# thumbnail area
|
||||||
if area_ratio < self._thumbnail_area_threshold:
|
if area_ratio < self._thumbnail_area_threshold:
|
||||||
thumbnail_img = params.image.resize(
|
thumbnail_img = params.image.resize(
|
||||||
(self._thumbnail_size, self._thumbnail_size)
|
(self._thumbnail_size, self._thumbnail_size)
|
||||||
@ -198,11 +198,11 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
tiling_augment_prob: float = 0.4,
|
tiling_augment_prob: float = 0.4,
|
||||||
) -> DynamicResolutionParams:
|
) -> DynamicResolutionParams:
|
||||||
"""Process a single media item and return its parameters.
|
"""Process a single media item and return its parameters.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
media: The media item to process
|
media: The media item to process
|
||||||
num_tokens_available: Number of tokens available for this media
|
num_tokens_available: Number of tokens available for this media
|
||||||
data_augment: Whether to apply data augmentation to the image. Defaults to False.
|
data_augment: Whether to apply data augmentation to the image. Defaults to
|
||||||
|
False.
|
||||||
Returns:
|
Returns:
|
||||||
DynamicResolutionParams for the media
|
DynamicResolutionParams for the media
|
||||||
"""
|
"""
|
||||||
@ -222,7 +222,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
target_patch_height = math.floor(factor * closest_patch_height)
|
target_patch_height = math.floor(factor * closest_patch_height)
|
||||||
target_patch_width = math.floor(factor * closest_patch_width)
|
target_patch_width = math.floor(factor * closest_patch_width)
|
||||||
|
|
||||||
# We only consider self._min_num_patches if it is greater than current_num_tokens_available.
|
# We only consider self._min_num_patches if it is greater than
|
||||||
|
# current_num_tokens_available.
|
||||||
if (
|
if (
|
||||||
current_num_tokens_available > self._min_num_patches
|
current_num_tokens_available > self._min_num_patches
|
||||||
and target_patch_height * target_patch_width < self._min_num_patches
|
and target_patch_height * target_patch_width < self._min_num_patches
|
||||||
@ -244,7 +245,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
new_patch_width = math.ceil(up_factor * target_patch_width)
|
new_patch_width = math.ceil(up_factor * target_patch_width)
|
||||||
|
|
||||||
if new_patch_height * new_patch_width > current_num_tokens_available:
|
if new_patch_height * new_patch_width > current_num_tokens_available:
|
||||||
# If only one side can be min_side, make as big as possible at native aspect ratio while staying below max_patches
|
# If only one side can be min_side, make as big as possible at
|
||||||
|
# native aspect ratio while staying below max_patches
|
||||||
if (
|
if (
|
||||||
max(current_num_tokens_available // new_patch_width, 1)
|
max(current_num_tokens_available // new_patch_width, 1)
|
||||||
* self._patch_size
|
* self._patch_size
|
||||||
@ -271,7 +273,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
new_patch_width = math.ceil(up_factor * target_patch_width)
|
new_patch_width = math.ceil(up_factor * target_patch_width)
|
||||||
|
|
||||||
if new_patch_height * new_patch_width > current_num_tokens_available:
|
if new_patch_height * new_patch_width > current_num_tokens_available:
|
||||||
# If only one side can be min_side, make as big as possible at native aspect ratio while staying below max_patches
|
# If only one side can be min_side, make as big as possible at
|
||||||
|
# native aspect ratio while staying below max_patches
|
||||||
if (
|
if (
|
||||||
max(current_num_tokens_available // new_patch_height, 1)
|
max(current_num_tokens_available // new_patch_height, 1)
|
||||||
* self._patch_size
|
* self._patch_size
|
||||||
@ -355,7 +358,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
thumbnail_area = self._thumbnail_size * self._thumbnail_size
|
thumbnail_area = self._thumbnail_size * self._thumbnail_size
|
||||||
area_ratio = resized_area / thumbnail_area
|
area_ratio = resized_area / thumbnail_area
|
||||||
|
|
||||||
# Only add thumbnail if resized image area is less than threshold % of thumbnail area
|
# Only add thumbnail if resized image area is less than threshold % of
|
||||||
|
# thumbnail area
|
||||||
if area_ratio < self._thumbnail_area_threshold:
|
if area_ratio < self._thumbnail_area_threshold:
|
||||||
num_tiles += 1 # Add 1 for thumbnail
|
num_tiles += 1 # Add 1 for thumbnail
|
||||||
# Add embeddings for thumbnail (thumbnail_size x thumbnail_size)
|
# Add embeddings for thumbnail (thumbnail_size x thumbnail_size)
|
||||||
@ -435,7 +439,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
media_list: List of media items to process
|
media_list: List of media items to process
|
||||||
num_tokens_available: Total number of tokens available across all media
|
num_tokens_available: Total number of tokens available across all media
|
||||||
max_num_tiles: Maximum number of tiles (unused in this implementation)
|
max_num_tiles: Maximum number of tiles (unused in this implementation)
|
||||||
data_augment: Whether to apply data augmentation to the image. Defaults to False.
|
data_augment: Whether to apply data augmentation to the image. Defaults to
|
||||||
|
False.
|
||||||
Returns:
|
Returns:
|
||||||
List of ImageTilingParams for each media item
|
List of ImageTilingParams for each media item
|
||||||
"""
|
"""
|
||||||
@ -444,19 +449,21 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
* (4 if self._pixel_shuffle else 1)
|
* (4 if self._pixel_shuffle else 1)
|
||||||
* (4 if self._conv_merging else 1)
|
* (4 if self._conv_merging else 1)
|
||||||
)
|
)
|
||||||
# When the number of available tokens is too small, allow self._min_num_patches per media and
|
# When the number of available tokens is too small, allow self._min_num_patches
|
||||||
# let the sample be truncated.
|
# per media and let the sample be truncated.
|
||||||
num_tokens_available = max(
|
num_tokens_available = max(
|
||||||
num_tokens_available, self._min_num_patches * len(media_list)
|
num_tokens_available, self._min_num_patches * len(media_list)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Clip the number of tokens available per media to be between min and max patches.
|
# Clip the number of tokens available per media to be between min and max
|
||||||
|
# patches.
|
||||||
num_tokens_available_per_media = [
|
num_tokens_available_per_media = [
|
||||||
max(min(num_tokens_available, self._max_num_patches), self._min_num_patches)
|
max(min(num_tokens_available, self._max_num_patches), self._min_num_patches)
|
||||||
for _ in range(len(media_list))
|
for _ in range(len(media_list))
|
||||||
]
|
]
|
||||||
|
|
||||||
# In theory this could be a while True loop, but in case the process_media method slightly
|
# In theory this could be a while True loop, but in case the process_media
|
||||||
|
# method slightly
|
||||||
# changes, I want to make sure we don't get stuck in an infinite loop.
|
# changes, I want to make sure we don't get stuck in an infinite loop.
|
||||||
for _ in range(10):
|
for _ in range(10):
|
||||||
# Step 1: Process each media with current token budget
|
# Step 1: Process each media with current token budget
|
||||||
@ -496,8 +503,8 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
for i in range(len(num_tokens_available_per_media))
|
for i in range(len(num_tokens_available_per_media))
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
# If there was no scaling down, we're stuck, so just use min_num_patches per media, else
|
# If there was no scaling down, we're stuck, so just use min_num_patches per
|
||||||
# try with the scaled down num_tokens_available_per_media.
|
# media, else try with the scaled down num_tokens_available_per_media.
|
||||||
if not scaled_down:
|
if not scaled_down:
|
||||||
num_tokens_available_per_media = [self._min_num_patches] * len(
|
num_tokens_available_per_media = [self._min_num_patches] * len(
|
||||||
media_list
|
media_list
|
||||||
@ -558,8 +565,15 @@ class DynamicResolutionImageTilingStrategy:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"DynamicResolutionImageTransform(vision_model_type={self._vision_model_type}, min_num_patches={self._min_num_patches}, patch_size={self._patch_size}, pixel_shuffle={self._pixel_shuffle}, conv_merging={self._conv_merging}, use_thumbnail={self._use_thumbnail}, thumbnail_size={self._thumbnail_size}, thumbnail_area_threshold={self._thumbnail_area_threshold})"
|
return f"DynamicResolutionImageTransform(\
|
||||||
|
vision_model_type={self._vision_model_type}, \
|
||||||
|
min_num_patches={self._min_num_patches}, \
|
||||||
|
patch_size={self._patch_size}, \
|
||||||
|
pixel_shuffle={self._pixel_shuffle}, \
|
||||||
|
conv_merging={self._conv_merging}, \
|
||||||
|
use_thumbnail={self._use_thumbnail}, \
|
||||||
|
thumbnail_size={self._thumbnail_size}, \
|
||||||
|
thumbnail_area_threshold={self._thumbnail_area_threshold})"
|
||||||
|
|
||||||
|
|
||||||
image_tiling_strategy = DynamicResolutionImageTilingStrategy(
|
image_tiling_strategy = DynamicResolutionImageTilingStrategy(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user