diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 551f284a3609..f80ba3a7aa23 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -755,7 +755,7 @@ class FusedMoE(CustomOp): intermediate_size: Intermediate size of the experts params_dtype: Data type for the parameters. reduce_results: Whether to all all_reduce on the output of the layer - renomalize: Whether to renormalize the logits in the fused_moe kernel + renormalize: Whether to renormalize the logits in the fused_moe kernel quant_config: Quantization configure. enable_eplb: Whether to enable expert parallelism load balancer. """ diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index f14f13e2ade9..13c3ab4f06dd 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -420,9 +420,8 @@ def shuffle_weights( Args: *tensors: Variable number of torch.Tensor objects. - layout: A pair of integers specifying the - block sizes used to divide the tensors during shuffling. - Default is (16, 16). + layout: A pair of integers specifying the block sizes used to divide + the tensors during shuffling. Default is (16, 16). Returns: A Tuple of shuffled tensors. diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py index c8b107f13cd0..8758a570b3c6 100644 --- a/vllm/model_executor/layers/fused_moe/routing_simulator.py +++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py @@ -10,7 +10,7 @@ like uniform random routing. """ from abc import ABC, abstractmethod -from typing import Optional +from typing import Any, Optional import torch @@ -50,7 +50,9 @@ class DistributionBasedRouting(RoutingStrategy): distributions for testing different routing patterns. """ - def __init__(self, distribution: str = "uniform", **distribution_params): + def __init__(self, + distribution: str = "uniform", + **distribution_params: Any): """ Initialize distribution-based routing. @@ -244,7 +246,7 @@ class RoutingSimulator: cls._routing_strategies[name] = strategy @classmethod - def get_available_strategies(cls): + def get_available_strategies(cls) -> list[str]: """ Get list of available routing strategy names. diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index 39bd34d351f6..d05c0c0d5473 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -202,7 +202,7 @@ class BitBLASLinearMethod(LinearMethodBase): output_size: int, params_dtype: torch.dtype, **extra_weight_attrs, - ): + ) -> None: """Creates quantized weights for use in linear operations. The function initializes and returns a dictionary containing quantized @@ -211,7 +211,7 @@ class BitBLASLinearMethod(LinearMethodBase): Args: input_size_per_partition: The size of the input partition. - output_size_per_partition: The size of the output partition. + output_partition_sizes: List of output partition sizes. input_size: The total size of the input (unused). output_size: The total size of the output (unused). params_dtype: @@ -222,9 +222,9 @@ class BitBLASLinearMethod(LinearMethodBase): scales ('scales'), and zeros ('zeros'). 
Raises: - ValueError: If `params_dtype` is not `torch.float16` or if the - input size per partition is not divisible by the group size in - `quant_config`. + ValueError: If `params_dtype` is not `torch.float16` or if the input + size per partition is not divisible by the group size + in `quant_config`. """ del input_size, output_size # Unused arguments. weight_loader = extra_weight_attrs["weight_loader"] diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index d03074f86184..646229258648 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -265,9 +265,9 @@ class GPTQBitBLASLinearMethod(LinearMethodBase): scales ('scales'), and zeros ('zeros'). Raises: - ValueError: If `params_dtype` is not `torch.float16` or - if the input size per partition is not divisible by the - group size in `quant_config`. + ValueError: If `params_dtype` is not `torch.float16` or if the input + size per partition is not divisible by the group size + in `quant_config`. """ if params_dtype != torch.float16: raise ValueError("Parameter data type must be torch.float16, " diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index 4bcfcd04b3d8..f10d20999bee 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -46,11 +46,11 @@ def choose_mp_linear_kernel( performance. Args: - config (MPLinearLayerConfig): Description of the linear layer to be - implemented. + config (MPLinearLayerConfig): Description of the linear layer to be + implemented. compute_capability (Optional[int], optional): The compute capability of - the target device, if None uses `current_platform` to get the compute - capability. Defaults to None. + the target device, if None uses `current_platform` to get + the compute capability. Defaults to None. Raises: ValueError: If no kernel can implement the given config. 
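The Raises sections rewrapped above (BitBLASLinearMethod and GPTQBitBLASLinearMethod) document the same two preconditions the code enforces. A minimal standalone sketch of that validation follows; the helper name is illustrative and not part of vLLM:

    import torch

    def _check_bitblas_weight_args(params_dtype: torch.dtype,
                                   input_size_per_partition: int,
                                   group_size: int) -> None:
        # The two conditions named in the Raises docstrings above.
        if params_dtype != torch.float16:
            raise ValueError("Parameter data type must be torch.float16, "
                             f"but got {params_dtype}")
        if group_size != -1 and input_size_per_partition % group_size != 0:
            raise ValueError(
                f"input_size_per_partition ({input_size_per_partition}) must "
                f"be divisible by the quantization group size ({group_size})")

The `group_size != -1` guard is an assumption here, mirroring the common quantization convention that -1 denotes per-channel (ungrouped) scales.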
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index b5e4d727bf21..a2481375f4fb 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -7,7 +7,7 @@ #!/usr/bin/env python3 import abc import math -from typing import Literal, Optional +from typing import Any, Literal, Optional, Union import numpy as np import torch @@ -131,31 +131,31 @@ class ConformerEncoderLayer(nn.Module): def __init__( self, - d_model=512, - ext_pw_out_channel=0, - depthwise_seperable_out_channel=256, - depthwise_multiplier=1, - n_head=4, - d_ffn=2048, - ext_pw_kernel_size=1, - kernel_size=3, - dropout_rate=0.1, - causal=False, - batch_norm=False, - activation="relu", - chunk_se=0, - chunk_size=18, - conv_activation="relu", - conv_glu_type="sigmoid", - bias_in_glu=True, - linear_glu_in_convm=False, - attention_inner_dim=-1, - attention_glu_type="swish", - activation_checkpointing="", - export=False, - use_pt_scaled_dot_product_attention=False, + d_model: int = 512, + ext_pw_out_channel: int = 0, + depthwise_seperable_out_channel: int = 256, + depthwise_multiplier: int = 1, + n_head: int = 4, + d_ffn: int = 2048, + ext_pw_kernel_size: int = 1, + kernel_size: int = 3, + dropout_rate: float = 0.1, + causal: bool = False, + batch_norm: bool = False, + activation: str = "relu", + chunk_se: int = 0, + chunk_size: int = 18, + conv_activation: str = "relu", + conv_glu_type: str = "sigmoid", + bias_in_glu: bool = True, + linear_glu_in_convm: bool = False, + attention_inner_dim: int = -1, + attention_glu_type: str = "swish", + activation_checkpointing: str = "", + export: bool = False, + use_pt_scaled_dot_product_attention: bool = False, attn_group_sizes: int = 1, - ): + ) -> None: super().__init__() self.feed_forward_in = FeedForward( @@ -209,24 +209,21 @@ class ConformerEncoderLayer(nn.Module): def forward( self, - x, - pos_k, - pos_v, - mask, + x: torch.Tensor, + pos_k: torch.Tensor, + pos_v: torch.Tensor, + mask: torch.Tensor, relative_attention_bias: Optional[Tensor] = None, - ): + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ConformerEncoder forward. Args: - x: torch.Tensor - input feature of shape (batch, max_time_in, size) - pos_k: torch.Tensor - positional key embedding. - mask: torch.Tensor - mask for x (batch, max_time_in) - relative_attention_bias: Optional[torch.Tensor] - bias added to attention logits w.r.t. relative positions - (1, n_head, time1, time2) + x: input feature of shape (batch, max_time_in, size) + pos_k: positional key embedding. + pos_v: positional value embedding. + mask: mask for x (batch, max_time_in) + relative_attention_bias: bias added to attention logits w.r.t. 
+ relative positions (1, n_head, time1, time2) """ x = x + 0.5 * self.feed_forward_in(x) norm_x = self.layer_norm_att(x) @@ -323,25 +320,25 @@ class TransformerEncoderBase(abc.ABC, nn.Module): def __init__( self, - input_size, - chunk_size, - left_chunk, - attention_dim=256, - attention_heads=4, - input_layer="nemo_conv", - cnn_out=-1, - cnn_layer_norm=False, - time_reduction=4, - dropout_rate=0.0, - padding_idx=-1, - relative_attention_bias_args=None, - positional_dropout_rate=0.0, - nemo_conv_settings=None, + input_size: int, + chunk_size: Union[int, list[int]], + left_chunk: Union[int, list[int]], + attention_dim: int = 256, + attention_heads: int = 4, + input_layer: str = "nemo_conv", + cnn_out: int = -1, + cnn_layer_norm: bool = False, + time_reduction: int = 4, + dropout_rate: float = 0.0, + padding_idx: int = -1, + relative_attention_bias_args: Optional[dict[str, Any]] = None, + positional_dropout_rate: float = 0.0, + nemo_conv_settings: Optional[dict[str, Any]] = None, conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none", - attention_group_size=1, - encoder_embedding_config=None, - ): + attention_group_size: int = 1, + encoder_embedding_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__() self.input_size = input_size self.input_layer = input_layer @@ -399,7 +396,10 @@ class TransformerEncoderBase(abc.ABC, nn.Module): self.encoder_embedding = MeanVarianceNormLayer( self.encoder_embedding_config["input_size"]) - def compute_lens_change(self, feature_lens): + def compute_lens_change( + self, + feature_lens: Union[int, + torch.Tensor]) -> Union[int, torch.Tensor]: """feature_lens: int return updated feature lens. @@ -433,10 +433,14 @@ class TransformerEncoderBase(abc.ABC, nn.Module): return ceil_func(feature_lens / self.time_reduction) @abc.abstractmethod - def forward(self): + def forward(self) -> Any: """Abstract forward method implementation.""" - def _chunk_size_selection(self, chunk_size=None, left_chunk=None): + def _chunk_size_selection( + self, + chunk_size: Optional[Union[int, list[int]]] = None, + left_chunk: Optional[Union[int, + list[int]]] = None) -> tuple[int, int]: """If chunk size is a list, we will randomly select a chunk size.""" if chunk_size is None: @@ -463,7 +467,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): return chunk_size_train_eff, left_chunk_train_eff - def _get_embed_class(self, embed): + def _get_embed_class(self, embed: nn.Module) -> nn.Module: # pylint: disable=protected-access is_embed_using_act_chkpt = isinstance(embed, CheckpointWrapper) is_embed_fsdp_wrapped = isinstance(embed, FullyShardedDataParallel) @@ -474,13 +478,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module): embed_class = embed.module return embed_class - def _forward_embeddings_core(self, input_tensor, masks): + def _forward_embeddings_core( + self, input_tensor: torch.Tensor, + masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: embed_class = self._get_embed_class(self.embed) assert isinstance(embed_class, NemoConvSubsampling) input_tensor, masks = self.embed(input_tensor, masks) return input_tensor, masks - def _position_embedding(self, input_tensor): + def _position_embedding( + self, input_tensor: torch.Tensor + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: pos_k = None pos_v = None if self.relative_attention_bias_layer is None: @@ -488,7 +496,9 @@ class TransformerEncoderBase(abc.ABC, nn.Module): input_tensor) # default to add abs sinusoid embedding return pos_k, pos_v - def _streaming_mask(self, seq_len, 
batch_size, chunk_size, left_chunk): + def _streaming_mask(self, seq_len: int, batch_size: int, + chunk_size: Union[int, list[int]], + left_chunk: Union[int, list[int]]) -> torch.Tensor: chunk_size_train_eff, left_chunk_train_eff = \ self._chunk_size_selection(chunk_size, left_chunk) @@ -502,11 +512,17 @@ class TransformerEncoderBase(abc.ABC, nn.Module): [batch_size, -1, -1])) return enc_streaming_mask - def forward_embeddings(self, - xs_pad, - masks, - chunk_size_nc=None, - left_chunk_nc=None): + def forward_embeddings( + self, + xs_pad: torch.Tensor, + masks: torch.Tensor, + chunk_size_nc: Optional[Union[int, list[int]]] = None, + left_chunk_nc: Optional[Union[int, list[int]]] = None + ) -> Union[tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor], torch.Tensor, torch.Tensor], + tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor], torch.Tensor, torch.Tensor, + torch.Tensor]]: """Forwarding the inputs through the top embedding layers Args: @@ -569,7 +585,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module): return input_tensor, pos_k, pos_v, hs_mask, masks return input_tensor, pos_k, pos_v, hs_mask, masks, hs_mask_nc - def get_offset(self): + def get_offset(self) -> int: """Returns offset used when retaining inputs for decoding. This is essentially, how many additional frames have to be added to @@ -605,8 +621,6 @@ class ConformerEncoder(TransformerEncoderBase): Some examples for the 2 cases: left_chunk = 6 left_chunk = [12, 9, 6, 3] - left_chunk: int - number of chunks used for masking in streaming mode. num_lang: int This parameter is used to store the number of languages in the lang_dict, only used for multiseed/multilingual models. @@ -751,46 +765,46 @@ class ConformerEncoder(TransformerEncoderBase): def __init__( # pylint: disable-all self, - input_size, - chunk_size, - left_chunk, - num_lang=None, - attention_dim=256, - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - input_layer="nemo_conv", - causal=True, - batch_norm=False, - cnn_out=-1, - cnn_layer_norm=False, - ext_pw_out_channel=0, - ext_pw_kernel_size=1, - depthwise_seperable_out_channel=256, - depthwise_multiplier=1, - chunk_se=0, - kernel_size=3, - activation="relu", - conv_activation="relu", - conv_glu_type="sigmoid", - bias_in_glu=True, - linear_glu_in_convm=False, - attention_glu_type="swish", - export=False, - extra_layer_output_idx=-1, - extra_multi_layer_output_idxs=[], # noqa - activation_checkpointing="", - relative_attention_bias_args=None, - time_reduction=4, - use_pt_scaled_dot_product_attention=False, - nemo_conv_settings=None, + input_size: int, + chunk_size: Union[int, list[int]], + left_chunk: Union[int, list[int]], + num_lang: Optional[int] = None, + attention_dim: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + input_layer: str = "nemo_conv", + causal: bool = True, + batch_norm: bool = False, + cnn_out: int = -1, + cnn_layer_norm: bool = False, + ext_pw_out_channel: int = 0, + ext_pw_kernel_size: int = 1, + depthwise_seperable_out_channel: int = 256, + depthwise_multiplier: int = 1, + chunk_se: int = 0, + kernel_size: int = 3, + activation: str = "relu", + conv_activation: str = "relu", + conv_glu_type: str = "sigmoid", + bias_in_glu: bool = True, + linear_glu_in_convm: bool = False, + attention_glu_type: str = "swish", + export: bool = False, + extra_layer_output_idx: int = -1, + extra_multi_layer_output_idxs: list[int] = [], # noqa + activation_checkpointing: str = "", + 
relative_attention_bias_args: Optional[dict[str, Any]] = None, + time_reduction: int = 4, + use_pt_scaled_dot_product_attention: bool = False, + nemo_conv_settings: Optional[dict[str, Any]] = None, conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none", - replication_pad_for_subsample_embedding=False, - attention_group_size=1, - encoder_embedding_config=None, - ): + replication_pad_for_subsample_embedding: bool = False, + attention_group_size: int = 1, + encoder_embedding_config: Optional[dict[str, Any]] = None, + ) -> None: super().__init__( input_size, chunk_size, @@ -852,11 +866,13 @@ class ConformerEncoder(TransformerEncoderBase): # the device and the needed dtype: self.register_buffer("dev_type", torch.zeros(()), persistent=False) - def init_relative_attention_bias(self, input_tensor): + def init_relative_attention_bias( + self, input_tensor: torch.Tensor) -> Optional[torch.Tensor]: if self.relative_attention_bias_layer: return self.relative_attention_bias_layer(input_tensor) - def calculate_hs_mask(self, xs_pad, device, mask): + def calculate_hs_mask(self, xs_pad: torch.Tensor, device: torch.device, + mask: Optional[torch.Tensor]) -> torch.Tensor: max_audio_length = xs_pad.shape[1] batch_size = xs_pad.shape[0] enc_streaming_mask = self._streaming_mask(max_audio_length, batch_size, @@ -877,7 +893,8 @@ class ConformerEncoder(TransformerEncoderBase): return pad_mask @torch.jit.ignore - def forward(self, xs_pad, masks): + def forward(self, xs_pad: torch.Tensor, + masks: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Conformer Forward function Args: @@ -997,7 +1014,12 @@ class WindowQformer(nn.Module): if normalize_before else None) self.window_size = window_size - def forward(self, audio_embed, mask, embed_len=None): + def forward( + self, + audio_embed: torch.Tensor, + mask: Optional[torch.Tensor], + embed_len: Optional[int] = None + ) -> tuple[torch.Tensor, Optional[int]]: """forward decoder""" # audio_embed: N x T x D => N x D x T @@ -1042,7 +1064,7 @@ class WindowQformer(nn.Module): class AudioEmbedding(nn.Module): """Image embedding.""" - def __init__(self, config: PretrainedConfig, **kwargs) -> None: + def __init__(self, config: PretrainedConfig, **kwargs: Any) -> None: super().__init__() self.config = config # n_embed or hidden_size for text LM @@ -1148,19 +1170,18 @@ class AudioEmbedding(nn.Module): self.input_embeds = None self.audio_embed_sizes = None - def set_audio_embeds(self, input_embeds: torch.FloatTensor) -> None: + def set_audio_embeds(self, input_embeds: torch.Tensor) -> None: self.input_embeds = input_embeds - def set_audio_embed_sizes(self, - audio_embed_sizes: torch.LongTensor) -> None: + def set_audio_embed_sizes(self, audio_embed_sizes: torch.Tensor) -> None: self.audio_embed_sizes = audio_embed_sizes def get_audio_features( self, - input_embeds: torch.FloatTensor, - audio_attention_mask: torch.Tensor = None, + input_embeds: torch.Tensor, + audio_attention_mask: Optional[torch.Tensor] = None, audio_projection_mode: str = "speech", - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ arguments: input_embeds: audio features (B, T, D) B: num audios in a sequence @@ -1214,10 +1235,10 @@ class AudioEmbedding(nn.Module): def forward( self, - audio_features: torch.FloatTensor, - audio_attention_mask: torch.Tensor = None, + audio_features: torch.Tensor, + audio_attention_mask: Optional[torch.Tensor] = None, audio_projection_mode: str = "speech", - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ arguments: audio_features: audio features (T, D) 
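The phi4mm_audio.py signatures above replace implicit-Optional defaults such as `audio_attention_mask: torch.Tensor = None` with explicit `Optional[torch.Tensor] = None`, and widen `torch.FloatTensor`/`torch.LongTensor` hints to plain `torch.Tensor`. A small self-contained sketch of the pattern; the function itself is illustrative, not vLLM code:

    from typing import Optional

    import torch

    def masked_mean(x: torch.Tensor,
                    mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Annotating the parameter as plain `torch.Tensor` while defaulting it
        # to None is an implicit-Optional signature, which strict type
        # checkers reject; Optional[...] declares the None case explicitly.
        if mask is None:
            return x.mean(dim=1)
        mask = mask.to(x.dtype)
        return (x * mask.unsqueeze(-1)).sum(dim=1) / mask.sum(dim=1,
                                                              keepdim=True)

Preferring the base `torch.Tensor` over the legacy per-dtype classes is consistent with current PyTorch practice: type checkers do not verify dtype through `torch.FloatTensor` anyway, so the narrower hint adds no safety.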
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index 59535503822d..6fbfca619a42 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -16,13 +16,13 @@ from torch import Tensor, nn class BlockBase(nn.Module): """Block abstract module""" - def __init__(self, input_size, output_size): + def __init__(self, input_size: int, output_size: int) -> None: super().__init__() self.input_size = input_size self.output_size = output_size -def get_activation(name="relu"): +def get_activation(name: str = "relu") -> torch.nn.Module: """Select an activation function by name Args: @@ -43,15 +43,18 @@ def get_activation(name="relu"): return nn.Identity() -def adaptive_enc_mask(x_len, chunk_start_idx, left_window=0, right_window=0): +def adaptive_enc_mask(x_len: int, + chunk_start_idx: list[int], + left_window: int = 0, + right_window: int = 0) -> torch.Tensor: """ The function is very important for Transformer Transducer Streaming mode Args: - xs_len (int): sequence length - chunk_start_idx (list): first idx of each chunk, such as [0,18,36,48]. + x_len: sequence length + chunk_start_idx: first idx of each chunk, such as [0,18,36,48]. It also supports adaptive chunk size [0,10,15,45] - left_window (int): how many left chunks can be seen - right_window (int): how many right chunks can be seen. It is used for + left_window: how many left chunks can be seen + right_window: how many right chunks can be seen. It is used for chunk overlap model. Returns: mask (torch.Tensor): a mask tensor for streaming model @@ -172,13 +175,13 @@ class GLUPointWiseConv(nn.Module): def __init__( self, - input_dim, - output_dim, - kernel_size, - glu_type="sigmoid", - bias_in_glu=True, - causal=False, - ): + input_dim: int, + output_dim: int, + kernel_size: int, + glu_type: str = "sigmoid", + bias_in_glu: bool = True, + causal: bool = False, + ) -> None: super().__init__() self.glu_type = glu_type @@ -216,11 +219,10 @@ class GLUPointWiseConv(nn.Module): self.b1 = nn.Parameter(torch.zeros(1, output_dim, 1)) self.b2 = nn.Parameter(torch.zeros(1, output_dim, 1)) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """ Args: - x: torch.Tensor - input tensor + x: input tensor """ # to be consistent with GLULinear, we assume the input always has the # #channel (#dim) in the last dimension of the tensor, so need to @@ -272,12 +274,12 @@ class DepthWiseSeperableConv1d(nn.Module): def __init__( self, - input_dim, - depthwise_seperable_out_channel, - kernel_size, - depthwise_multiplier, - padding=0, - ): + input_dim: int, + depthwise_seperable_out_channel: int, + kernel_size: int, + depthwise_multiplier: int, + padding: int = 0, + ) -> None: super().__init__() self.dw_conv = nn.Conv1d( @@ -301,12 +303,11 @@ class DepthWiseSeperableConv1d(nn.Module): self.pw_conv = nn.Identity() self.depthwise_seperable_out_channel = depthwise_seperable_out_channel - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """ Args: - x: torch.Tensor - input tensor + x: input tensor """ x = self.dw_conv(x) if self.depthwise_seperable_out_channel != 0: @@ -375,23 +376,23 @@ class ConvModule(nn.Module): def __init__( self, - input_dim, - ext_pw_out_channel, - depthwise_seperable_out_channel, - ext_pw_kernel_size, - kernel_size, - depthwise_multiplier, - dropout_rate, - causal=False, - batch_norm=False, - chunk_se=0, - chunk_size=18, - activation="relu", - glu_type="sigmoid", - bias_in_glu=True, - linear_glu_in_convm=False, - export=False, - 
): + input_dim: int, + ext_pw_out_channel: int, + depthwise_seperable_out_channel: int, + ext_pw_kernel_size: int, + kernel_size: int, + depthwise_multiplier: int, + dropout_rate: float, + causal: bool = False, + batch_norm: bool = False, + chunk_se: int = 0, + chunk_size: int = 18, + activation: str = "relu", + glu_type: str = "sigmoid", + bias_in_glu: bool = True, + linear_glu_in_convm: bool = False, + export: bool = False, + ) -> None: super().__init__() self.layer_norm = nn.LayerNorm(input_dim) self.input_dim = input_dim @@ -437,7 +438,7 @@ class ConvModule(nn.Module): self.ln2 = nn.Linear(input_dim * depthwise_multiplier, input_dim) - def _add_ext_pw_layer(self): + def _add_ext_pw_layer(self) -> None: """ This function is an extension of __init__ function and dedicated to the convolution module creation @@ -497,12 +498,11 @@ class ConvModule(nn.Module): self.pw_conv_simplify_w = torch.nn.Parameter(torch.ones(3)) self.pw_conv_simplify_b = torch.nn.Parameter(torch.zeros(3)) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """ConvModule Forward. Args: - x: torch.Tensor - input tensor. + x: input tensor. """ x = self.layer_norm(x) @@ -567,21 +567,20 @@ class GLULinear(nn.Module): def __init__( self, - input_dim, - output_dim, - glu_type="sigmoid", - bias_in_glu=True, - ): + input_dim: int, + output_dim: int, + glu_type: str = "sigmoid", + bias_in_glu: bool = True, + ) -> None: super().__init__() self.linear = nn.Linear(input_dim, output_dim * 2, bias_in_glu) self.glu_act = GLU(-1, glu_type) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """GLULinear forward Args: - x: torch.Tensor - inpute tensor. + x: input tensor. """ x = self.linear(x) return self.glu_act(x) @@ -609,12 +608,12 @@ class FeedForward(nn.Module): def __init__( self, - d_model, - d_inner, - dropout_rate, - activation="sigmoid", - bias_in_glu=True, - ): + d_model: int, + d_inner: int, + dropout_rate: float, + activation: str = "sigmoid", + bias_in_glu: bool = True, + ) -> None: super().__init__() self.d_model = d_model self.d_inner = d_inner @@ -628,12 +627,11 @@ class FeedForward(nn.Module): nn.Dropout(dropout_rate), ) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: """FeedForward forward function. Args: - x: torch.Tensor - input tensor. + x: input tensor. """ out = self.net(self.layer_norm(x)) @@ -642,14 +640,14 @@ class FeedForward(nn.Module): #### positional encoding starts here def _pre_hook( - state_dict, - prefix, - local_metadata, - strict, - missing_keys, - unexpected_keys, - error_msgs, -): + state_dict: dict, + prefix: str, + local_metadata: dict, + strict: bool, + missing_keys: list[str], + unexpected_keys: list[str], + error_msgs: list[str], +) -> None: """Perform pre-hook in load_state_dict for backward compatibility. 
Note: @@ -708,10 +706,10 @@ class T5RelativeAttentionLogitBias(nn.Module): """ def __init__(self, - num_heads, - num_buckets=-1, - max_distance=1000, - symmetric=False): + num_heads: int, + num_buckets: int = -1, + max_distance: int = 1000, + symmetric: bool = False) -> None: super().__init__() self.num_heads = num_heads self.num_buckets = num_buckets @@ -727,7 +725,7 @@ class T5RelativeAttentionLogitBias(nn.Module): self.num_buckets *= 2 self.bias_values = nn.Embedding(self.num_buckets, self.num_heads) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: # instantiate bias compatible with shape of x maxpos = x.size(1) context_position = torch.arange(maxpos, @@ -760,7 +758,7 @@ class T5RelativeAttentionLogitBias(nn.Module): return t5_rel_att_bias - def _bucket_relative_position(self, relative_position): + def _bucket_relative_position(self, relative_position: Tensor) -> Tensor: # this is a placeholder (isn't tested, likely buggy) using HuggingFace # implem as a reference this also needs to be extended to support # asymmetric +/- ve positions @@ -810,7 +808,10 @@ class AbsolutePositionalEncoding(nn.Module): """ - def __init__(self, d_model, dropout_rate, max_len=5000): + def __init__(self, + d_model: int, + dropout_rate: float, + max_len: int = 5000) -> None: """Construct an PositionalEncoding object.""" super().__init__() self.d_model = d_model @@ -820,11 +821,11 @@ class AbsolutePositionalEncoding(nn.Module): self.extend_pe(torch.tensor(0.0).expand(1, max_len)) self._register_load_state_dict_pre_hook(_pre_hook) - def extend_pe(self, x): + def extend_pe(self, x: torch.Tensor) -> None: """Reset the positional encodings. Args: - x: torch.Tensor + x: input tensor """ if self.pe is not None and self.pe.size(1) >= x.size(1): if self.pe.dtype != x.dtype or self.pe.device != x.device: @@ -840,15 +841,14 @@ class AbsolutePositionalEncoding(nn.Module): pe = pe.unsqueeze(0) self.pe = pe.to(device=x.device, dtype=x.dtype) - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor) -> torch.Tensor: """Add positional encoding. Args: - x: torch.Tensor - Input tensor. shape is (batch, time, ...) + x: Input tensor. shape is (batch, time, ...) Returns: - torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) + Encoded tensor. Its shape is (batch, time, ...) """ self.extend_pe(x) @@ -868,7 +868,7 @@ class MeanVarianceNormLayer(nn.Module): layer input size. """ - def __init__(self, input_size): + def __init__(self, input_size: int) -> None: super().__init__() self.input_size = input_size self.global_mean = nn.Parameter(torch.zeros(input_size)) @@ -878,8 +878,7 @@ class MeanVarianceNormLayer(nn.Module): """MeanVarianceNormLayer Forward Args: - input_: torch.Tensor - input tensor. + input_: input tensor. 
""" return (input_ - self.global_mean) * self.global_invstd @@ -949,7 +948,10 @@ class CausalConv1D(nn.Conv1d): dtype=dtype, ) - def update_cache(self, x, cache=None): + def update_cache( + self, + x: Tensor, + cache: Optional[Tensor] = None) -> tuple[Tensor, Optional[Tensor]]: if cache is None: new_x = F.pad(x, pad=(self._left_padding, self._right_padding)) next_cache = cache @@ -963,7 +965,11 @@ class CausalConv1D(nn.Conv1d): next_cache = next_cache[:, :, -cache.size(-1):] return new_x, next_cache - def forward(self, x, cache=None): + def forward( + self, + x: Tensor, + cache: Optional[Tensor] = None + ) -> Union[Tensor, tuple[Tensor, Optional[Tensor]]]: x, cache = self.update_cache(x, cache=cache) x = super().forward(x) if cache is None: @@ -1017,8 +1023,8 @@ class CausalConv2D(nn.Conv2d): def forward( self, - x, - ): + x: Tensor, + ) -> Tensor: x = F.pad( x, pad=(self._left_padding, self._right_padding, 0, 0), @@ -1062,16 +1068,16 @@ class NemoConvSubsampling(torch.nn.Module): """ def __init__( - self, - feat_in, - feat_out, - subsampling_factor=4, - subsampling="dw_striding", - conv_channels=256, - subsampling_conv_chunking_factor=1, - activation=nn.ReLU(), # noqa: B008 - is_causal=False, - ): + self, + feat_in: int, + feat_out: int, + subsampling_factor: int = 4, + subsampling: str = "dw_striding", + conv_channels: int = 256, + subsampling_conv_chunking_factor: int = 1, + activation: torch.nn.Module = nn.ReLU(), # noqa: B008 + is_causal: bool = False, + ) -> None: super().__init__() self._subsampling = subsampling self._conv_channels = conv_channels @@ -1328,28 +1334,25 @@ class NemoConvSubsampling(torch.nn.Module): self.conv = torch.nn.Sequential(*layers) - def get_sampling_frames(self): + def get_sampling_frames(self) -> list[int]: return [1, self.subsampling_factor] - def get_streaming_cache_size(self): + def get_streaming_cache_size(self) -> list[int]: return [0, self.subsampling_factor + 1] - def forward(self, x, mask): + def forward(self, x: Tensor, + mask: Optional[Tensor]) -> tuple[Tensor, Optional[Tensor]]: """ Forward method for NeMo subsampling. 
Args: - x[Batch, Time, Filters]: torch.Tensor - input tensor - x_mask: torch.Tensor - input mask + x: input tensor + mask: input mask Returns: - x: torch.Tensor - Resulting tensor from subsampling (B, T // + x: Resulting tensor from subsampling (B, T // time_reduction_factor, feat_out) - pad_mask: torch.Tensor - tensor of padded hidden state sequences (B, 1, T // + pad_mask: tensor of padded hidden state sequences (B, 1, T // time_reduction_factor) """ x = x.unsqueeze(1) if self.conv2d_subsampling else x.transpose(1, 2) @@ -1403,7 +1406,7 @@ class NemoConvSubsampling(torch.nn.Module): padding_length.size(0), -1) < padding_length.unsqueeze(1) return x, pad_mask.unsqueeze(1) - def reset_parameters(self): + def reset_parameters(self) -> None: # initialize weights if self._subsampling == "dw_striding": with torch.no_grad(): @@ -1433,7 +1436,7 @@ class NemoConvSubsampling(torch.nn.Module): torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale) torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale) - def conv_split_by_batch(self, x): + def conv_split_by_batch(self, x: Tensor) -> tuple[Tensor, bool]: """Tries to split input by batch, run conv and concat results""" b, _, _, _ = x.size() if b == 1: # can't split if batch size is 1 @@ -1460,7 +1463,7 @@ class NemoConvSubsampling(torch.nn.Module): True, ) - def conv_split_by_channel(self, x): + def conv_split_by_channel(self, x: Tensor) -> Tensor: """For dw convs, tries to split input by time, run conv and concat results""" x = self.conv[0](x) # full conv2D @@ -1500,7 +1503,8 @@ class NemoConvSubsampling(torch.nn.Module): x = self.conv[i * 3 + 4](x) # activation return x - def channel_chunked_conv(self, conv, chunk_size, x): + def channel_chunked_conv(self, conv: torch.nn.Module, chunk_size: int, + x: Tensor) -> Tensor: """Performs channel chunked convolution""" ind = 0 @@ -1541,7 +1545,7 @@ class NemoConvSubsampling(torch.nn.Module): return torch.cat(out_chunks, 1) def change_subsampling_conv_chunking_factor( - self, subsampling_conv_chunking_factor: int): + self, subsampling_conv_chunking_factor: int) -> None: if (subsampling_conv_chunking_factor != -1 and subsampling_conv_chunking_factor != 1 and subsampling_conv_chunking_factor % 2 != 0): @@ -1552,12 +1556,12 @@ class NemoConvSubsampling(torch.nn.Module): self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor -def calc_length(lengths, - all_paddings, - kernel_size, - stride, - ceil_mode, - repeat_num=1): +def calc_length(lengths: Tensor, + all_paddings: int, + kernel_size: int, + stride: int, + ceil_mode: bool, + repeat_num: int = 1) -> Tensor: """Calculates the output length of a Tensor passed through a convolution or max pooling layer""" add_pad: float = all_paddings - kernel_size @@ -1573,11 +1577,11 @@ def calc_length(lengths, class AttModule(nn.Module): """Attention abstraction module""" - def __init__(self): + def __init__(self) -> None: super().__init__() self.export_mode = False - def set_export(self, mode=True): + def set_export(self, mode: bool = True) -> None: """set the export mode""" self.export_mode = mode @@ -1591,14 +1595,10 @@ class AttModule(nn.Module): """AttModule forward Args: - x: torch.Tensor - input tensor. - memory: torch.Tensor, optional - memory tensor. - pos_emb: torch.Tensor, optional - positional encoder embedding. - att_mask: torch.Tensor, optional - attention mask tensor. + x: input tensor. + memory: memory tensor. + pos_emb: positional encoder embedding. + att_mask: attention mask tensor. 
""" return x, memory, pos_emb, att_mask @@ -1606,15 +1606,15 @@ class AttModule(nn.Module): class AttBlock(BlockBase, AttModule): """Attention Block module to support both Attention and Block module.""" - def memory_dims(self, max_len=False): + def memory_dims(self, max_len: bool = False) -> tuple[int, int]: """memory dimensions""" return (1, self.input_size) def masked_softmax( - scores, + scores: Tensor, mask: Optional[Tensor], -): +) -> Tensor: if mask is not None: mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2) scores = scores.masked_fill(mask, -torch.inf) @@ -1636,10 +1636,6 @@ class MultiHeadedAttention(nn.Module): input size features. dropout_rate: float dropout rate. - use_LN: bool - apply layer norm or not - dropout_at_output: bool - whether to apply dropout at output attention_inner_dim: int, optional the attention dimension used in the class, it can be different from the input dimension n_feat. @@ -1666,16 +1662,16 @@ class MultiHeadedAttention(nn.Module): def __init__( self, - n_head, - n_feat, - dropout_rate, - attention_inner_dim=-1, - glu_type="swish", - bias_in_glu=True, - use_pt_scaled_dot_product_attention=False, - n_value=-1, + n_head: int, + n_feat: int, + dropout_rate: float, + attention_inner_dim: int = -1, + glu_type: str = "swish", + bias_in_glu: bool = True, + use_pt_scaled_dot_product_attention: bool = False, + n_value: int = -1, group_size: int = 1, - ): + ) -> None: super().__init__() if n_value == -1: n_value = n_feat @@ -1718,28 +1714,22 @@ class MultiHeadedAttention(nn.Module): query: Tensor, key: Tensor, value: Tensor, - pos_k: Tensor, - pos_v: Tensor, + pos_k: Optional[Tensor], + pos_v: Optional[Tensor], mask: Optional[Tensor], relative_attention_bias: Optional[Tensor] = None, - ): + ) -> Tensor: """Compute 'Scaled Dot Product Attention'. Args: - query: torch.Tensor - query tensor (batch, time1, size) - key: torch.Tensor - key tensor (batch, time2, size) - value: torch.Tensor - value tensor (batch, time1, size) - pos_k: torch.Tensor - key tensor used for relative positional embedding. - pos_v: torch.Tensor - value tensor used for relative positional embedding. - mask: torch.Tensor - mask tensor (batch, time1, time2) - relative_attention_bias: torch.Tensor - bias added to attention logits w.r.t. relative positions + query: query tensor (batch, time1, size) + key: key tensor (batch, time2, size) + value: value tensor (batch, time1, size) + pos_k: key tensor used for relative positional embedding. + pos_v: value tensor used for relative positional embedding. + mask: mask tensor (batch, time1, time2) + relative_attention_bias: bias added to attention logits w.r.t. + relative positions (1, n_head, time1, time2) """ n_batch = query.size(0) @@ -1832,20 +1822,20 @@ class MultiSequential(torch.nn.Sequential): """Multi-input multi-output torch.nn.Sequential""" @torch.jit.ignore - def forward(self, *args): + def forward(self, *args) -> tuple: """Forward method implementation.""" for m in self: args = m(*args) return args -def get_offset(input_layer: str, time_reduction: int): +def get_offset(input_layer: str, time_reduction: int) -> int: """Get an offset. We will use the offset for determining #frames of a subsampled feature. 
Args: - input_layer (str): Type of an input layer - time_reduction (int): time reduction factor for downsampling a feature + input_layer: Type of an input layer + time_reduction: time reduction factor for downsampling a feature Returns: int: offset """ @@ -1858,13 +1848,14 @@ def get_offset(input_layer: str, time_reduction: int): return 0 -def unfold_tensor(xs_pad, max_seq_len): +def unfold_tensor(xs_pad: Tensor, max_seq_len: int) -> Tensor: """ For a given tensor with shape of (N, T, D), if sequence length T is longer than max_seq_len, this function unfold it to a (NT', max_seq_len, D) where T' is T // max_seq_len. Args: - xs_pad: N, T, D + xs_pad: input tensor with shape (N, T, D) + max_seq_len: maximum sequence length """ _, _, D = xs_pad.shape xs_pad = xs_pad.transpose(-1, -2) # convert to N, D, T diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index a052b2a486f6..020142b2c1c1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1193,21 +1193,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, input_ids: Flattened (concatenated) input_ids corresponding to a batch. positions: Flattened (concatenated) position ids corresponding to a - batch. - **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL - opensource models), the shape will be `(3, seq_len)`, + batch. **NOTE**: If mrope is enabled (default setting for + Qwen2.5-VL opensource models), the shape will be `(3, seq_len)`, otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. - second_per_grid_ts: Tensor `(num_videos)` of video time interval ( - in seconds) for each grid along the temporal dimension in the - 3D position IDs. `None` if no videos are passed. """ if intermediate_tensors is not None: diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index ed65944c109b..34b9c1ad07d7 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -9,7 +9,7 @@ model alternates between state space model layers and attention-based layers. 
""" from collections.abc import Iterable from itertools import cycle -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -528,8 +528,6 @@ class Zamba2MambaDecoderLayer(nn.Module): hidden_states: Input tensor [batch_size, seq_len, hidden_size] mamba_cache_params: Parameters for Mamba's state caches (one for conv, one for ssm) - sequence_idx: Index tensor for identifying sequences in batch - Required for proper chunked processing in prefill transformer_hidden_states: Optional output from transformer path Added to input if provided (used in hybrid architecture) positions: Optional position IDs (unused in Mamba) @@ -591,8 +589,6 @@ class Zamba2HybridLayer(nn.Module): Args: shared_transformer: Transformer decoder layer for attention pathway - linear: Linear projection for transformer output before Mamba - mamba: Mamba decoder layer for state space pathway """ super().__init__() self.block_idx = block_idx @@ -630,8 +626,6 @@ class Zamba2HybridLayer(nn.Module): positions: Position IDs for positional embeddings mamba_cache_params: Parameters for Mamba's state caches (one for conv, one for ssm) - sequence_idx: Indices for identifying sequences in batch, - required for proper chunked processing in prefill Returns: Output tensor combining transformer and Mamba representations @@ -915,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): prefix: Optional prefix for parameter names Raises: - AssertionError: If prefix caching is enabled (not supported by - Mamba) + AssertionError: If prefix caching is enabled + (not supported by Mamba) """ config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -971,7 +965,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): input_ids: torch.Tensor, positions: torch.Tensor, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs) -> torch.Tensor: + **kwargs: Any) -> torch.Tensor: """Forward pass through the model. Args: @@ -1012,9 +1006,9 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid): return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str, - torch.Tensor], - **kwargs) -> dict[str, torch.Tensor]: + def copy_inputs_before_cuda_graphs( + self, input_buffers: dict[str, torch.Tensor], + **kwargs: Any) -> dict[str, torch.Tensor]: """Copy inputs before CUDA graph capture. Args: