mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-12 08:35:01 +08:00
[VLM] Clean up models (#16873)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
5124f5bf51
commit
205d84aaa9
@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
|
|||||||
tokenizer_mode="mistral" if args.format == "mistral" else "auto",
|
tokenizer_mode="mistral" if args.format == "mistral" else "auto",
|
||||||
config_format="mistral" if args.format == "mistral" else "auto",
|
config_format="mistral" if args.format == "mistral" else "auto",
|
||||||
load_format="mistral" if args.format == "mistral" else "auto",
|
load_format="mistral" if args.format == "mistral" else "auto",
|
||||||
|
limit_mm_per_prompt={"image": 1},
|
||||||
max_model_len=4096,
|
max_model_len=4096,
|
||||||
max_num_seqs=2,
|
max_num_seqs=2,
|
||||||
tensor_parallel_size=2,
|
tensor_parallel_size=2,
|
||||||
|
|||||||
@ -957,7 +957,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
|
|||||||
"max_pixels": 1280 * 28 * 28,
|
"max_pixels": 1280 * 28 * 28,
|
||||||
"fps": [1],
|
"fps": [1],
|
||||||
},
|
},
|
||||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
limit_mm_per_prompt={"image": 1},
|
||||||
)
|
)
|
||||||
|
|
||||||
if modality == "image":
|
if modality == "image":
|
||||||
|
|||||||
@ -503,26 +503,6 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
return {"audio": None, "image": None}
|
return {"audio": None, "image": None}
|
||||||
|
|
||||||
def get_mm_max_tokens_per_item(
|
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> Mapping[str, int]:
|
|
||||||
return {
|
|
||||||
"image": self.get_max_image_tokens(),
|
|
||||||
"audio": self.get_max_audio_tokens(),
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_max_audio_tokens(self) -> int:
|
|
||||||
sr = self.get_feature_extractor().sampling_rate
|
|
||||||
num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr)
|
|
||||||
return self._compute_audio_embed_size(num_frames)
|
|
||||||
|
|
||||||
def get_max_image_tokens(self) -> int:
|
|
||||||
target_width, target_height = self.get_image_size_with_most_features()
|
|
||||||
return self.get_num_image_tokens(image_width=target_width,
|
|
||||||
image_height=target_height)
|
|
||||||
|
|
||||||
def _find_target_aspect_ratio(
|
def _find_target_aspect_ratio(
|
||||||
self,
|
self,
|
||||||
orig_width: int,
|
orig_width: int,
|
||||||
@ -764,9 +744,6 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
|
|||||||
num_audios = mm_counts.get("audio", 0)
|
num_audios = mm_counts.get("audio", 0)
|
||||||
num_images = mm_counts.get("image", 0)
|
num_images = mm_counts.get("image", 0)
|
||||||
|
|
||||||
target_width, target_height = \
|
|
||||||
self.info.get_image_size_with_most_features()
|
|
||||||
|
|
||||||
target_width, target_height = \
|
target_width, target_height = \
|
||||||
self.info.get_image_size_with_most_features()
|
self.info.get_image_size_with_most_features()
|
||||||
|
|
||||||
|
|||||||
@ -172,26 +172,9 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo,
|
|||||||
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
assert isinstance(feature_extractor, WhisperFeatureExtractor)
|
||||||
return feature_extractor
|
return feature_extractor
|
||||||
|
|
||||||
def get_max_audio_tokens(self) -> int:
|
|
||||||
hf_config = self.get_hf_config()
|
|
||||||
max_source_position = hf_config.audio_config.max_source_positions
|
|
||||||
output_lengths = (max_source_position - 2) // 2 + 1
|
|
||||||
return output_lengths
|
|
||||||
|
|
||||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
return {"audio": None, "image": None, "video": None}
|
return {"audio": None, "image": None, "video": None}
|
||||||
|
|
||||||
def get_mm_max_tokens_per_item(
|
|
||||||
self,
|
|
||||||
seq_len: int,
|
|
||||||
mm_counts: Mapping[str, int],
|
|
||||||
) -> Mapping[str, int]:
|
|
||||||
return {
|
|
||||||
"audio": self.get_max_audio_tokens(),
|
|
||||||
"image": self.get_max_image_tokens(),
|
|
||||||
"video": self.get_max_video_tokens(seq_len, mm_counts),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class Qwen2_5OmniThinkerDummyInputsBuilder(
|
class Qwen2_5OmniThinkerDummyInputsBuilder(
|
||||||
BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]):
|
BaseDummyInputsBuilder[Qwen2_5OmniThinkerProcessingInfo]):
|
||||||
@ -210,7 +193,6 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
|
|||||||
return (audio_token * num_audios + image_token * num_images +
|
return (audio_token * num_audios + image_token * num_images +
|
||||||
video_token * num_videos)
|
video_token * num_videos)
|
||||||
|
|
||||||
# TODO: @abstractmethod after transition
|
|
||||||
def get_dummy_mm_data(
|
def get_dummy_mm_data(
|
||||||
self,
|
self,
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user