# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""

from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Optional, TypedDict, Union

import torch
import torch.nn as nn
from transformers import BatchFeature
from transformers.models.qwen2_audio import (Qwen2AudioConfig,
                                             Qwen2AudioEncoder,
                                             Qwen2AudioProcessor)
from transformers.models.whisper import WhisperFeatureExtractor

from vllm.config import VllmConfig
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                    MultiModalKwargs)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
                                   MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
                                        PromptUpdate, PromptUpdateDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors

from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                    maybe_prefix, merge_multimodal_embeddings)


# === Audio Inputs === #
class Qwen2AudioInputs(TypedDict):
    input_features: torch.Tensor
    """Shape: `(num_audios, num_mel_bins, 3000)`"""

    feature_attention_mask: torch.Tensor
    """Shape: `(num_audios, 3000)`"""


# === Audio Encoder === #


class Qwen2AudioMultiModalProjector(nn.Module):

    def __init__(self, audio_hidden_size: int, text_hidden_size: int):
        super().__init__()
        self.linear = nn.Linear(audio_hidden_size, text_hidden_size, bias=True)
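        # The projector maps encoder outputs of width `audio_hidden_size`
        # (the audio encoder's d_model) to the language model's embedding
        # width `text_hidden_size`.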

    def forward(self, audio_features):
        hidden_states = self.linear(audio_features)
        return hidden_states


# From Qwen2AudioEncoder._get_feat_extract_output_lengths
def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
    feat_lengths = (input_lengths - 1) // 2 + 1
    output_lengths = (feat_lengths - 2) // 2 + 1
    return feat_lengths, output_lengths
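# For example, a full 30 s Whisper chunk yields 3000 mel frames, so
#   feat_lengths   = (3000 - 1) // 2 + 1 = 1500
#   output_lengths = (1500 - 2) // 2 + 1 = 750
# i.e. each 30 s clip contributes at most 750 audio embeddings to the prompt.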


class Qwen2AudioProcessingInfo(BaseProcessingInfo):

    def get_hf_config(self):
        return self.ctx.get_hf_config(Qwen2AudioConfig)

    def get_hf_processor(
        self,
        *,
        # Ignored in initialization
        sampling_rate: Optional[int] = None,
        **kwargs: object,
    ) -> Qwen2AudioProcessor:
        return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)

    def get_feature_extractor(
        self,
        *,
        # Ignored in initialization
        sampling_rate: Optional[int] = None,
    ) -> WhisperFeatureExtractor:
        hf_processor = self.get_hf_processor(sampling_rate=sampling_rate)
        feature_extractor = hf_processor.feature_extractor  # type: ignore
        assert isinstance(feature_extractor, WhisperFeatureExtractor)
        return feature_extractor

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
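        # `None` means there is no fixed limit on the number of audio clips
        # per prompt.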
        return {"audio": None}


class Qwen2AudioDummyInputsBuilder(
        BaseDummyInputsBuilder[Qwen2AudioProcessingInfo]):

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_audios = mm_counts.get("audio", 0)

        hf_processor = self.info.get_hf_processor()
        audio_token = hf_processor.audio_token

        return audio_token * num_audios

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> MultiModalDataDict:
        feature_extractor = self.info.get_feature_extractor()

        sampling_rate = feature_extractor.sampling_rate
        audio_len = feature_extractor.chunk_length * sampling_rate
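        # With the default Whisper feature extractor (chunk_length=30,
        # sampling_rate=16000) this is 30 * 16000 = 480000 samples, i.e. one
        # full chunk, used here as the worst-case dummy audio for profiling.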
        num_audios = mm_counts.get("audio", 0)

        return {
            "audio":
            self._get_dummy_audios(length=audio_len, num_audios=num_audios)
        }


class Qwen2AudioMultiModalProcessor(
        BaseMultiModalProcessor[Qwen2AudioProcessingInfo]):

    def _get_data_parser(self) -> MultiModalDataParser:
        feature_extractor = self.info.get_feature_extractor()
        return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, Any],
        tok_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        # NOTE: we rename `audios` -> `audio` in mm_data because transformers
        # has deprecated `audios` for the Qwen2-Audio processor and will
        # remove support for it in transformers 4.54.
        audios = mm_data.pop("audios", [])
        if audios:
            mm_data["audio"] = audios

        # Text-only input is not supported by the composite processor
        if not mm_data.get("audio", []):
            prompt_ids = self.info.get_tokenizer().encode(prompt)
            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
        mm_kwargs = dict(
            **mm_kwargs,
            sampling_rate=feature_extractor.sampling_rate,
        )

        return super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
            tok_kwargs=tok_kwargs,
        )

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(
            input_features=MultiModalFieldConfig.batched("audio"),
            feature_attention_mask=MultiModalFieldConfig.batched("audio"),
        )

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        tokenizer = self.info.get_tokenizer()
        vocab = tokenizer.get_vocab()

        # Use getattr with default to be compatible with transformers<4.48
        audio_token = getattr(processor, "audio_token", "<|AUDIO|>")
        audio_bos_token = getattr(processor, "audio_bos_token",
                                  "<|audio_bos|>")
        audio_eos_token = getattr(processor, "audio_eos_token",
                                  "<|audio_eos|>")

        audio_token_id = vocab[audio_token]
        audio_bos_id = vocab[audio_bos_token]
        audio_eos_id = vocab[audio_eos_token]

        feature_attention_mask = out_mm_kwargs.get("feature_attention_mask")
        if feature_attention_mask is None:
            audio_output_lengths = []
        else:
            assert isinstance(feature_attention_mask, torch.Tensor)
            _, audio_output_lens = _get_feat_extract_output_lengths(
                feature_attention_mask.sum(-1))

            audio_output_lengths = audio_output_lens.tolist()

        def get_replacement_qwen2_audio(item_idx: int):
            num_features = audio_output_lengths[item_idx]
            if num_features == 0:
                audios = mm_items.get_items("audio", AudioProcessorItems)
                audio_len = audios.get_audio_length(item_idx)

                raise ValueError(f"The audio (len={audio_len}) is too short "
                                 "to be represented inside the model")

            audio_tokens = [audio_token_id] * num_features
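
            # Emit <|audio_bos|> + num_features * <|AUDIO|> + <|audio_eos|>;
            # only the <|AUDIO|> positions are later replaced by the audio
            # embeddings (see `embed_token_id` below).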
            return PromptUpdateDetails.select_token_id(
                [audio_bos_id] + audio_tokens + [audio_eos_id],
                embed_token_id=audio_token_id,
            )

        return [
            PromptReplacement(
                modality="audio",
                target=audio_token,
                replacement=get_replacement_qwen2_audio,
            )
        ]


@MULTIMODAL_REGISTRY.register_processor(
    Qwen2AudioMultiModalProcessor,
    info=Qwen2AudioProcessingInfo,
    dummy_inputs=Qwen2AudioDummyInputsBuilder)
class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("audio"):
            return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"

        raise ValueError("Only audio modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = config
        self.multimodal_config = multimodal_config

        self.audio_tower = Qwen2AudioEncoder(config.audio_config)
        self.multi_modal_projector = Qwen2AudioMultiModalProjector(
            config.audio_config.d_model, config.text_config.hidden_size)

        self.quant_config = quant_config

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Qwen2ForCausalLM"],
        )

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)

    def _validate_and_reshape_mm_tensor(self, mm_input: object,
                                        name: str) -> torch.Tensor:
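        # Multi-modal kwargs may arrive either as a batched tensor or as a
        # list of per-item tensors; normalize both into a single tensor
        # concatenated along the first dimension.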
        if not isinstance(mm_input, (torch.Tensor, list)):
            raise ValueError(f"Incorrect type of {name}. "
                             f"Got type: {type(mm_input)}")
        if isinstance(mm_input, torch.Tensor):
            return torch.concat(list(mm_input))
        else:
            return torch.concat(mm_input)

    def _parse_and_validate_audio_input(
            self, **kwargs: object) -> Optional[Qwen2AudioInputs]:
        input_features = kwargs.pop('input_features', None)
        feature_attention_mask = kwargs.pop('feature_attention_mask', None)
        if input_features is None:
            return None
        input_features = self._validate_and_reshape_mm_tensor(
            input_features, 'input_features')
        feature_attention_mask = self._validate_and_reshape_mm_tensor(
            feature_attention_mask, 'feature_attention_mask')
        if not isinstance(input_features, (torch.Tensor, list)):
            raise ValueError("Incorrect type of audio input features. "
                             f"Got type: {type(input_features)}")
        return Qwen2AudioInputs(input_features=input_features,
                                feature_attention_mask=feature_attention_mask)

    def _process_audio_input(
            self,
            audio_input: Qwen2AudioInputs) -> tuple[torch.Tensor, ...]:

        input_features = audio_input["input_features"]
        feature_attention_mask = audio_input["feature_attention_mask"]

        audio_feat_lengths, audio_output_lengths = (
            self.audio_tower._get_feat_extract_output_lengths(
                feature_attention_mask.sum(-1)))

        batch_size, _, max_mel_seq_len = input_features.shape
        max_seq_len = (max_mel_seq_len - 2) // 2 + 1
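        # `max_seq_len` is the frame count after the encoder's stride-2
        # convolution (e.g. 1500 for 3000 mel frames); the attention mask
        # below is built at this resolution.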
        # Create a sequence tensor of shape (batch_size, max_seq_len)
        seq_range = (torch.arange(
            0,
            max_seq_len,
            dtype=audio_feat_lengths.dtype,
            device=audio_feat_lengths.device).unsqueeze(0).expand(
                batch_size, max_seq_len))
        lengths_expand = audio_feat_lengths.unsqueeze(-1).expand(
            batch_size, max_seq_len)
        # Create mask
        padding_mask = seq_range >= lengths_expand

        audio_attention_mask_ = padding_mask.view(
            batch_size, 1, 1, max_seq_len).expand(batch_size, 1, max_seq_len,
                                                  max_seq_len)
        audio_attention_mask = audio_attention_mask_.to(
            dtype=self.audio_tower.conv1.weight.dtype,
            device=self.audio_tower.conv1.weight.device)
        audio_attention_mask[audio_attention_mask_] = float("-inf")

        audio_outputs = self.audio_tower(input_features,
                                         attention_mask=audio_attention_mask)
        selected_audio_feature = audio_outputs.last_hidden_state
        audio_features = self.multi_modal_projector(selected_audio_feature)
        num_audios, max_audio_tokens, embed_dim = audio_features.shape
        audio_output_lengths = audio_output_lengths.unsqueeze(1)
        audio_features_mask = torch.arange(max_audio_tokens).expand(
            num_audios, max_audio_tokens).to(
                audio_output_lengths.device) < audio_output_lengths
        masked_audio_features = audio_features[audio_features_mask].view(
            -1, embed_dim)
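        # `masked_audio_features` now holds only the valid (non-padded) audio
        # embeddings from all audios, concatenated along the first dimension.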

        # Split into a tuple of embeddings, one per audio input.
        return torch.split(masked_audio_features,
                           audio_output_lengths.flatten().tolist())

    def get_language_model(self) -> torch.nn.Module:
        return self.language_model

    def get_multimodal_embeddings(self,
                                  **kwargs: object) -> MultiModalEmbeddings:
        audio_input = self._parse_and_validate_audio_input(**kwargs)
        if audio_input is None:
            return []
        masked_audio_features = self._process_audio_input(audio_input)
        return masked_audio_features

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
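        # Scatter the audio embeddings into the positions that hold the
        # audio placeholder token (`config.audio_token_index`).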
        if multimodal_embeddings is not None \
                and len(multimodal_embeddings) != 0:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                self.config.audio_token_index)
        return inputs_embeds

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:

        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at the model runner;
        # this branch is kept for v0 compatibility.
        elif inputs_embeds is None:
            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      multimodal_embeddings)
            input_ids = None

        hidden_states = self.language_model.model(input_ids,
                                                  positions,
                                                  intermediate_tensors,
                                                  inputs_embeds=inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)