diff --git a/pyproject.toml b/pyproject.toml index c3d0440f32b5b..9465f1e8f0596 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,6 @@ exclude = [ "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"] -"vllm/model_executor/models/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] "vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index c518efdb54f89..94a4328564bbb 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -458,8 +459,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -467,8 +468,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant): ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping: List[Tuple[str, str, int]] = [] - expert_params_mapping: List[Tuple[str, str, int]] = [] + mlp_params_mapping: list[tuple[str, str, int]] = [] + expert_params_mapping: list[tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): @@ -497,7 +498,7 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant): ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = 
set() + loaded_params: set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. " diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 7c716efab8ef1..f74e13888c48e 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import torch import torch.nn as nn @@ -66,8 +66,8 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant): # Identity layer self.post_layernorm = nn.Identity() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -75,7 +75,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # NOTE: post_layernorm is not used in Aria @@ -326,8 +326,8 @@ class AriaTextModel(LlamaModel, SupportsQuant): # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -339,7 +339,7 @@ class AriaTextModel(LlamaModel, SupportsQuant): ("experts.w2_weight", "experts.fc2.weight", 'w2'), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, 
loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -528,7 +528,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): self.vocab_size, logit_scale) def _validate_image_sizes( - self, images: List[torch.Tensor]) -> List[torch.Tensor]: + self, images: list[torch.Tensor]) -> list[torch.Tensor]: if not all(img.shape == images[0].shape for img in images): raise ValueError("All images must be the same size") return images @@ -578,7 +578,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): def _process_image_input( self, image_input: AriaImagePixelInputs - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input['pixel_values'] @@ -651,6 +651,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index d152287e8fa39..08d49d71eca12 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 Adapted from # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision -from typing import (Iterable, Literal, Mapping, Optional, Sequence, Set, Tuple, - TypedDict, Union, cast) +from collections.abc import Iterable, Mapping, Sequence +from typing import Literal, Optional, TypedDict, Union, cast import torch from torch import nn @@ -315,8 +315,8 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, def dtype(self): return next(self.parameters()).dtype - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) 
-> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 444ed38d05c01..077e36176430a 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -20,7 +20,8 @@ # limitations under the License. """Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -230,7 +231,7 @@ class BaiChuanDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -320,15 +321,15 @@ class BaiChuanModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -421,8 +422,8 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git 
a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 87e1e102efd81..d6a705fb1859a 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Bamba model.""" # Added by the IBM Team, 2024 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -355,8 +356,8 @@ class BambaModel(nn.Module): hidden_states, _ = self.final_layernorm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -367,7 +368,7 @@ class BambaModel(nn.Module): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -495,7 +496,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -535,7 +536,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index bcfbe92c3a11e..92bbe1bb67a3c 100644 --- 
a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -19,7 +19,8 @@ # limitations under the License. """PyTorch BART model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -859,14 +860,14 @@ class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): def _rename_stacked_param( self, name: str, - ) -> Tuple[str, Optional[str]]: + ) -> tuple[str, Optional[str]]: for key, mapping in self.stacked_params_mapping.items(): if key in name: name = name.replace(key, mapping["param_name"]) return name, mapping["shard_id"] return name, None - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): model_params_dict = dict(self.model.named_parameters()) top_params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 111b49ab8dd2a..0c6593bbe3a10 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -349,8 +350,8 @@ class BertModel(nn.Module, SupportsQuant): token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -359,7 +360,7 @@ class BertModel(nn.Module, SupportsQuant): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if self.pooler is None and "pooler" 
in name: continue @@ -424,7 +425,7 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) @@ -472,7 +473,7 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, self._pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 002949abff52a..af6deb3bf072e 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -208,7 +209,7 @@ class NomicRouter(nn.Module): def forward( self, x: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: weights = self.layer(x.view(-1, x.shape[-1]))[0].softmax( dim=-1, dtype=torch.float32) top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) @@ -428,8 +429,8 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): token_type_ids=token_type_ids) return self.encoder(positions, hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) if 
self.config.hidden_act in ["silu", "geglu"]: @@ -442,7 +443,7 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): stacked_params_mapping = [] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "pooler" in name: continue @@ -567,7 +568,7 @@ class GteNewModel(BertWithRope): } return config - def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def split_up_gate_proj(self, weights: Iterable[tuple[str, torch.Tensor]]): n = "mlp.up_gate_proj" for name, weight in weights: if n in name: @@ -578,14 +579,14 @@ class GteNewModel(BertWithRope): yield name, weight def ignore_unnecessary_layers(self, - weights: Iterable[Tuple[str, torch.Tensor]]): + weights: Iterable[tuple[str, torch.Tensor]]): for name, weight in weights: if name.startswith("classifier"): continue yield name, weight - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.ignore_unnecessary_layers(weights) weights = self.split_up_gate_proj(weights) return super().load_weights(weights) @@ -664,7 +665,7 @@ class JinaRobertaModel(BertWithRope): token_type_ids=token_type_ids) @torch.inference_mode() - def jina_merge_lora_weights(self, weights: Iterable[Tuple[str, + def jina_merge_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # use for jina-embeddings-v3 # Merge Lora weights into a single weight tensor. 
@@ -707,7 +708,7 @@ class JinaRobertaModel(BertWithRope): return [(name, weight) for name, weight in weights.items()] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.jina_merge_lora_weights(weights) return super().load_weights(weights) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f3d488926d09e..acbc5d04d7e35 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -296,8 +297,8 @@ class BlipVisionModel(nn.Module, SupportsQuant): return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -305,7 +306,7 @@ class BlipVisionModel(nn.Module, SupportsQuant): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f44565bd2e01f..2ff7e394a4163 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, 
TypedDict, Union import torch import torch.nn as nn @@ -186,7 +186,7 @@ class Blip2QFormerAttention(nn.Module): self, hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.Tensor]: + ) -> tuple[torch.Tensor]: self_output = self.attention( hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -712,7 +712,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 74d401b295cef..eb1085d6b40d7 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -18,7 +18,8 @@ # limitations under the License. 
"""Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -322,10 +323,10 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index ef8b033f3846d..a4528ca26d010 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -2,7 +2,7 @@ from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, Dict, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -229,7 +229,7 @@ class ChameleonAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -292,7 +292,7 @@ class ChameleonAttention(nn.Module): prefix=f"{prefix}.attn") def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: # reshape for layernorm q = q.reshape(-1, self.num_heads, self.head_dim) k = k.reshape(-1, self.num_kv_heads, self.head_dim) @@ -367,7 +367,7 @@ class 
ChameleonDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: if residual is None: residual = hidden_states @@ -438,7 +438,7 @@ class ChameleonSwinDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states hidden_states = self.self_attn( @@ -773,7 +773,7 @@ class ChameleonVQVAE(nn.Module): def encode( self, pixel_values: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: hidden_states = self.encoder(pixel_values) hidden_states = self.quant_conv(hidden_states) quant, emb_loss, indices = self.quantize(hidden_states) @@ -786,7 +786,7 @@ class ChameleonImageVocabularyMapping: A class for mapping discrete image tokens from VQGAN to BPE tokens. 
""" - def __init__(self, vocab_map: Dict[str, int]): + def __init__(self, vocab_map: dict[str, int]): self.vocab_map = vocab_map self.image_token_id = vocab_map.get("") @@ -1052,8 +1052,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1063,7 +1063,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 233e9ee0a2583..4e95afe1a1474 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -3,7 +3,8 @@ # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" import json -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -358,15 +359,15 @@ class ChatGLMModel(nn.Module, SupportsQuant): return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, 
weight_name, shard_id) in stacked_params_mapping: @@ -440,7 +441,7 @@ class ChatGLMBaseModel(nn.Module): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 153054e5c028b..e8f3ae2156e02 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -368,8 +369,8 @@ class CLIPVisionModel(nn.Module, SupportsQuant): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -377,7 +378,7 @@ class CLIPVisionModel(nn.Module, SupportsQuant): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 8f64e5d5c966c..546b5f932877d 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -21,7 +21,8 @@ # This file is based on the LLama model definition file in 
transformers """PyTorch Cohere model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -259,7 +260,7 @@ class CohereDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -404,8 +405,8 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -415,7 +416,7 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading rotary embeddings since vLLM has its own diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py index d073a7de69178..f1cc7e0f9e293 100644 --- a/vllm/model_executor/models/constant_size_cache.py +++ b/vllm/model_executor/models/constant_size_cache.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Dict, List, Tuple +from typing import Any import torch @@ -16,7 +16,7 @@ class ConstantSizeCache(ABC): def __init__(self, max_batch_size: int): # Maps between the request id and a dict that maps between the seq_id # and its index inside the cache - self.cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.cache_indices_mapping: dict[str, 
dict[int, int]] = {} self.free_cache_indices = list(range(max_batch_size)) @property @@ -30,7 +30,7 @@ class ConstantSizeCache(ABC): """Copy cache data from one index to another""" pass - def current_run_tensors(self, **kwargs) -> Tuple: + def current_run_tensors(self, **kwargs) -> tuple: """ Return the tensors for the current run's conv and ssm state. """ @@ -117,8 +117,8 @@ class ConstantSizeCache(ABC): return self.cache_indices_mapping[cur_rid][seq_id] def _prepare_current_run_cache( - self, request_ids_to_seq_ids: Dict[str, list[int]], - finished_requests_ids: List[str]) -> List[int]: + self, request_ids_to_seq_ids: dict[str, list[int]], + finished_requests_ids: list[str]) -> list[int]: return [ self._assign_seq_id_to_cache_index(req_id, seq_id, finished_requests_ids) @@ -127,7 +127,7 @@ class ConstantSizeCache(ABC): ] def _release_finished_requests(self, - finished_seq_groups_req_ids: List[str]): + finished_seq_groups_req_ids: list[str]): for req_id in finished_seq_groups_req_ids: if req_id in self.cache_indices_mapping: for seq_id in self.cache_indices_mapping[req_id]: diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 850fba2604e17..e0b4712cdb47b 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -414,14 +415,14 @@ class DbrxForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: expert_params_mapping = [( "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) - 
loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index c6421143dd685..88d1ca9f7b833 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Deepseek model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -184,7 +185,7 @@ class DeepseekAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -385,8 +386,8 @@ class DeepseekModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -397,7 +398,7 @@ class DeepseekModel(nn.Module): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -478,7 +479,7 @@ class DeepseekForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) 
return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index b50175cf764fb..6d7b52aba5f91 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -176,8 +177,8 @@ class DeepSeekMTP(nn.Module): return self.model.compute_logits(hidden_states, sampling_metadata, spec_step_idx) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), @@ -190,7 +191,7 @@ class DeepSeekMTP(nn.Module): num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 680b7e614dd6d..b78c193c1345a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -200,7 +201,7 @@ class DeepseekV2Attention(nn.Module): q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -352,7 +353,7 @@ class DeepseekV2MLAAttention(nn.Module): q_lora_rank: Optional[int], kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -736,8 +737,8 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -753,7 +754,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 6d8f27530ceec..164fa40ffebe5 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -4,7 +4,7 @@ """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math from collections.abc import Iterable, 
Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -45,7 +45,7 @@ _IMAGE_TOKEN = "" class DeepseekVL2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, num_channels, height, width)` """ @@ -57,7 +57,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): class DeepseekVL2VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -394,8 +394,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): return model def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.vision_config.image_size expected_dims = (3, h, w) @@ -415,8 +415,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): return data def _validate_images_spatial_crop( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: expected_dims = 2 def _validate_shape(d: torch.Tensor): @@ -640,8 +640,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = 
AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 4ff1e785494f7..726660796a6fc 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -183,7 +184,7 @@ class EAGLE(nn.Module): return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. Here's a compatible version with the same weights: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 4a6490cd127a5..4ffd06319684c 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -24,7 +24,8 @@ # limitations under the License. 
"""Inference-only Exaone model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ class ExaoneAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -196,7 +197,7 @@ class ExaoneBlockAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -282,7 +283,7 @@ class ExaoneDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -384,8 +385,8 @@ class ExaoneModel(nn.Module): hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -395,7 +396,7 @@ class ExaoneModel(nn.Module): (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -535,8 +536,8 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, 
weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # With tie_word_embeddings, we can skip lm_head.weight diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 310aca999bc2d..00dbbebb120e8 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -16,7 +16,7 @@ # limitations under the License. """Llama model for fairseq2 weights.""" -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch from torch.nn import Parameter @@ -44,8 +44,8 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM): f"model.{self.tp_rank}.pt", ] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # fairseq2's serialization adds a wrapper to usual .pt state_dict's: # { "model_key": my_model_name, "my_model_name": state_dict } # which we first need to unpack @@ -102,7 +102,7 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM): name: str, loaded_weight: torch.Tensor, params: dict[str, Parameter], - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: """Reshape fairseq2's weights.""" def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e7e03fc099723..376793594f8ba 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -20,7 +20,8 @@ """PyTorch Falcon model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -394,8 +395,8 @@ class FalconModel(nn.Module): hidden_states = self.ln_f(hidden_states) return hidden_states - def 
load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -405,7 +406,7 @@ class FalconModel(nn.Module): total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: @@ -498,8 +499,8 @@ class FalconForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index d1a36c3f481a1..f8acc56706d2b 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -3,7 +3,7 @@ import math from collections import OrderedDict from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -713,8 +713,8 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -723,7 +723,7 @@ class 
Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -922,8 +922,8 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, 'Florence2 only supports COSINE as temporal embedding.') def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: size = self.processor_config["size"] h, w = size["height"], size["width"] @@ -944,12 +944,12 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, return data def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) @@ -1096,7 +1096,7 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d6bd6155a447e..fbad7f56d0ba7 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -18,7 +18,7 @@ """ PyTorch 
Fuyu model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict +from typing import Literal, Optional, TypedDict import torch import torch.nn as nn @@ -382,7 +382,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.language_model.lm_head, hidden_states, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index c1cc0df11178d..0f6d94e7518bb 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,8 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import cache -from typing import Iterable, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -231,7 +232,7 @@ class GemmaDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -318,8 +319,8 @@ class GemmaModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -329,7 +330,7 @@ class GemmaModel(nn.Module): ("gate_up_proj", "up_proj", 1), ] 
params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: @@ -413,8 +414,8 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 7fb2e9948c068..b46716213c626 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -15,7 +15,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -218,7 +219,7 @@ class Gemma2DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -305,8 +306,8 @@ class Gemma2Model(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -316,7 +317,7 @@ class Gemma2Model(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = 
dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -413,8 +414,8 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 4e0d4f84ca6bd..3a88adcce0bdd 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -14,7 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -320,7 +321,7 @@ class Gemma3DecoderLayer(nn.Module): hidden_states: torch.Tensor, residual: Optional[torch.Tensor], **kwargs, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -412,8 +413,8 @@ class Gemma3Model(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -423,7 +424,7 @@ class Gemma3Model(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - 
loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -521,8 +522,8 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 65c177f8c5ade..743542ec8dfad 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict +from typing import Any, Literal, Optional, TypedDict import torch from torch import nn @@ -701,8 +701,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 290be968cb54b..f351ce5a06810 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GLM-4-0414 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -60,7 +61,7 @@ class Glm4Attention(nn.Module): rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -183,7 +184,7 @@ class Glm4DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -293,8 +294,8 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index e3219333915e9..470a7053e1b65 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -280,10 +281,10 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index def6b1544d8c2..6a1d97bd7b69c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -243,10 +244,10 @@ class GPTBigCodeModel(nn.Module): hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name: # Skip attention mask. 
@@ -327,8 +328,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."]), diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 3db96fb8e187c..69fdd90cfbe8b 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -228,8 +229,8 @@ class GPTJModel(nn.Module): hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -239,7 +240,7 @@ class GPTJModel(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue @@ -331,7 +332,7 @@ class GPTJForCausalLM(nn.Module, SupportsPP): sampling_metadata, self.lm_head.bias) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) \ No newline at end of file diff --git 
a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 620ee66f57e74..401fa9f5cc8bc 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -240,10 +241,10 @@ class GPTNeoXModel(nn.Module): hidden_states = self.final_layer_norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): @@ -324,7 +325,7 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 0696a7245c224..eed0820a5779d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only IBM Granite model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -97,7 +98,7 @@ class GraniteAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -230,7 +231,7 @@ class GraniteDecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -321,8 +322,8 @@ class GraniteModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -332,7 +333,7 @@ class GraniteModel(nn.Module): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -475,8 +476,8 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = [ "rotary_emb.inv_freq", # Models trained using ColossalAI may include these tensors in diff --git 
a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index b43b59da6d111..512ec55177d84 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -23,7 +23,8 @@ # limitations under the License. """Inference-only IBM Granite speeech model.""" import math -from typing import Iterable, Mapping, Optional, Set, Tuple, TypedDict, Union +from collections.abc import Iterable, Mapping +from typing import Optional, TypedDict, Union import torch import torch.nn.functional as F @@ -763,8 +764,8 @@ class GraniteSpeechForConditionalGeneration( def load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Set[str]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 7fff14cb9f120..f342dfff824f0 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GraniteMoe model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -305,8 +306,8 @@ class GraniteMoeModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -425,8 +426,8 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 706e648f1b4fd..443b102c99680 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only GraniteMoeHybrid model.""" # Added by the IBM Team, 2025 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -381,10 +382,10 @@ class GraniteMoeHybridModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() def _load(n, p): param = params_dict[n] @@ -538,7 +539,7 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, return 
self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -578,7 +579,7 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 4e660cbf667b2..817e6091d276a 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -4,7 +4,8 @@ The architecture is the same as granitemoe but with the addition of shared experts. 
""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -208,8 +209,8 @@ class GraniteMoeSharedModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -329,8 +330,8 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP): device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 6f56eb2d5e382..6d2d16d098d40 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Grok1 model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -263,7 +264,7 @@ class Grok1DecoderLayer(nn.Module): kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -340,7 +341,7 @@ class Grok1Model(nn.Module): self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, @@ -371,8 +372,8 @@ class Grok1Model(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -390,7 +391,7 @@ class Grok1Model(nn.Module): num_experts=num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and @@ -528,7 +529,7 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -547,8 +548,8 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> 
Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = ["rotary_emb.inv_freq"] # Skip lm_head when tie_word_embeddings is True if self.config.tie_word_embeddings: diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index cb0379c10f3a6..b8bdc7aa32b25 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -17,7 +17,8 @@ # limitations under the License. """PyTorch Idefics2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -342,8 +343,8 @@ class Idefics2VisionTransformer(nn.Module): last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -351,7 +352,7 @@ class Idefics2VisionTransformer(nn.Module): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 961954c2b584f..fdb128ef5b541 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -17,7 +17,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import Dict, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -85,7 +85,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): def get_hf_processor( self, *, - 
size: Optional[Dict[str, int]] = None, + size: Optional[dict[str, int]] = None, **kwargs: object, ) -> Idefics3Processor: if size is not None: @@ -752,8 +752,8 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 7fea9647ead97..8f33a3e29c60b 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, - Protocol, Type, Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, + Union, overload, runtime_checkable) import torch from torch import Tensor @@ -102,7 +102,7 @@ class _SupportsMultiModalType(Protocol): @overload def supports_multimodal( - model: Type[object]) -> TypeIs[Type[SupportsMultiModal]]: + model: type[object]) -> TypeIs[type[SupportsMultiModal]]: ... @@ -112,8 +112,8 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: def supports_multimodal( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: if isinstance(model, type): return isinstance(model, _SupportsMultiModalType) @@ -134,9 +134,9 @@ class SupportsLoRA(Protocol): """ # The `embedding_module` and `embedding_padding_modules` # are empty by default. 
- embedding_modules: ClassVar[Dict[str, str]] = {} - embedding_padding_modules: ClassVar[List[str]] = [] - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + embedding_modules: ClassVar[dict[str, str]] = {} + embedding_padding_modules: ClassVar[list[str]] = [] + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} # We can't use runtime_checkable with ClassVar for issubclass checks @@ -145,13 +145,13 @@ class SupportsLoRA(Protocol): class _SupportsLoRAType(Protocol): supports_lora: Literal[True] - packed_modules_mapping: Dict[str, List[str]] - embedding_modules: Dict[str, str] - embedding_padding_modules: List[str] + packed_modules_mapping: dict[str, list[str]] + embedding_modules: dict[str, str] + embedding_padding_modules: list[str] @overload -def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: +def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]: ... @@ -161,8 +161,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]: def supports_lora( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]: result = _supports_lora(model) if not result: @@ -191,7 +191,7 @@ def supports_lora( return result -def _supports_lora(model: Union[Type[object], object]) -> bool: +def _supports_lora(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -256,7 +256,7 @@ class _SupportsPPType(Protocol): @overload -def supports_pp(model: Type[object]) -> TypeIs[Type[SupportsPP]]: +def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]: ... 
@@ -266,8 +266,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]: def supports_pp( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: + model: Union[type[object], object], +) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]: supports_attributes = _supports_pp_attributes(model) supports_inspect = _supports_pp_inspect(model) @@ -298,14 +298,14 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: +def _supports_pp_attributes(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: +def _supports_pp_inspect(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -336,13 +336,13 @@ def has_inner_state(model: object) -> TypeIs[HasInnerState]: @overload -def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]: +def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: ... def has_inner_state( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]: if isinstance(model, type): return isinstance(model, _HasInnerStateType) @@ -373,13 +373,13 @@ def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: @overload -def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: +def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: ... 
def is_attention_free( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]: if isinstance(model, type): return isinstance(model, _IsAttentionFreeType) @@ -410,13 +410,13 @@ def is_hybrid(model: object) -> TypeIs[IsHybrid]: @overload -def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: +def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: ... def is_hybrid( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]: if isinstance(model, type): return isinstance(model, _IsHybridType) @@ -439,13 +439,13 @@ def has_noops(model: object) -> TypeIs[HasNoOps]: @overload -def has_noops(model: Type[object]) -> TypeIs[Type[HasNoOps]]: +def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]: ... def has_noops( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasNoOps]], TypeIs[HasNoOps]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasNoOps]], TypeIs[HasNoOps]]: if isinstance(model, type): return isinstance(model, _HasNoOpsType) @@ -461,7 +461,7 @@ class SupportsCrossEncoding(Protocol): @overload def supports_cross_encoding( - model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]: + model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]: ... 
@@ -471,8 +471,8 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: def _supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: if isinstance(model, type): return isinstance(model, SupportsCrossEncoding) @@ -481,15 +481,15 @@ def _supports_cross_encoding( def supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: return is_pooling_model(model) and _supports_cross_encoding(model) class SupportsQuant: """The interface required for all models that support quantization.""" - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} quant_config: Optional[QuantizationConfig] = None def __new__(cls, *args, **kwargs) -> Self: @@ -525,7 +525,7 @@ class SupportsTranscription(Protocol): @overload def supports_transcription( - model: Type[object]) -> TypeIs[Type[SupportsTranscription]]: + model: type[object]) -> TypeIs[type[SupportsTranscription]]: ... 
@@ -535,8 +535,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: def supports_transcription( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]: if isinstance(model, type): return isinstance(model, SupportsTranscription) @@ -551,7 +551,7 @@ class SupportsV0Only(Protocol): @overload -def supports_v0_only(model: Type[object]) -> TypeIs[Type[SupportsV0Only]]: +def supports_v0_only(model: type[object]) -> TypeIs[type[SupportsV0Only]]: ... @@ -561,8 +561,8 @@ def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]: def supports_v0_only( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsV0Only]], TypeIs[SupportsV0Only]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]: if isinstance(model, type): return isinstance(model, SupportsV0Only) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index f141dcf3cd4fc..d325a6b671328 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, Optional, Protocol, Type, Union, overload, +from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -20,7 +20,7 @@ logger = init_logger(__name__) # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa -# which has T = List[torch.Tensor] +# which has T = list[torch.Tensor] T = TypeVar("T", default=torch.Tensor) T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) @@ -48,12 +48,12 @@ class VllmModel(Protocol[T_co]): ... 
-def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_init(model: Union[type[object], object]) -> bool: model_init = model.__init__ return supports_kw(model_init, "vllm_config") -def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -75,7 +75,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: @overload -def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: +def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]: ... @@ -85,8 +85,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: def is_vllm_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]: return _check_vllm_model_init(model) and _check_vllm_model_forward(model) @@ -105,7 +105,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): @overload def is_text_generation_model( - model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: + model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]: ... @@ -116,8 +116,8 @@ def is_text_generation_model( def is_text_generation_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForTextGeneration]], + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForTextGeneration]], TypeIs[VllmModelForTextGeneration]]: if not is_vllm_model(model): return False @@ -142,7 +142,7 @@ class VllmModelForPooling(VllmModel[T], Protocol[T]): @overload -def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: +def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]: ... 
@@ -152,8 +152,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: def is_pooling_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index fdcef8b9be8d2..d9d9002bd5baa 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -6,8 +6,9 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Set, Tuple +from typing import Optional import torch import torch.nn as nn @@ -461,10 +462,10 @@ class InternVisionModel(nn.Module): return encoder_outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c3d7cbfcddbb9..3f3e3966e838a 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from functools import partial -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -81,7 +82,7 @@ class InternLM2Attention(nn.Module): 
num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -225,7 +226,7 @@ class InternLMDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -252,7 +253,7 @@ class InternLM2Model(nn.Module): *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer): + layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -316,7 +317,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model): + model_type: type[InternLM2Model] = InternLM2Model): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -361,15 +362,15 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -407,7 +408,7 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM): *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model, + 
model_type: type[InternLM2Model] = InternLM2Model, ): super().__init__(vllm_config=vllm_config, prefix=prefix, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 69b0caab8f8ec..6893d0239121d 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -66,7 +66,7 @@ class InternLM2VEDecoderLayer(nn.Module): hidden_states: torch.Tensor, residual: Optional[torch.Tensor], visual_token_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 23b92ad2bbf66..66e78fcc4e80c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,7 +8,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -932,8 +932,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B skip_prefixes = [ "action_embed", "temporal_embed", "track_embed", diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index e1e3f0f199c5f..d6a1e0bb48454 100644 --- 
a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -21,7 +21,8 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -333,10 +334,10 @@ class JAISLMHeadModel(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 46335c2b3930f..6f9fa60c9b05e 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -442,7 +443,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -464,8 +465,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: 
Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -482,7 +483,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -583,7 +584,7 @@ class JambaForSequenceClassification(JambaForCausalLM): logits = self.score(hidden_states) return self._pooler(logits, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # TODO: The reward weights themselves have float32 accuracy data, we # would like to load them in fp32 to get that extra precision. super().load_weights(weights) diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 0629266860fd3..b575f44765a89 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -43,10 +43,9 @@ import copy import math -from collections.abc import Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass -from typing import (Any, Iterable, List, Literal, Optional, Sequence, Tuple, - TypedDict, Union) +from typing import Any, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -120,7 +119,7 @@ class KimiVLMultiModalProjector(nn.Module): class KimiVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - pixel_values: Union[torch.Tensor, List[torch.Tensor]] + pixel_values: Union[torch.Tensor, list[torch.Tensor]] """ Shape:`(num_patches, num_channels, patch_size, patch_size)` """ @@ -447,7 +446,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal): sampling_metadata, **kwargs) return logits - def load_weights(self, weights: 
Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): config = self.config.text_config _KEYS_TO_MODIFY_MAPPING = { "language_model.lm_head": "lm_head", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c1593dcbe3444..c15c0213b520c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -103,7 +104,7 @@ class LlamaAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -285,7 +286,7 @@ class LlamaDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -394,8 +395,8 @@ class LlamaModel(nn.Module): return hidden_states, aux_hidden_states return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -405,7 +406,7 @@ class LlamaModel(nn.Module): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight 
in weights: if "rotary_emb.inv_freq" in name: continue @@ -582,8 +583,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -599,7 +600,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index dfd0804f21cf7..40fdd84d8fb08 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -16,7 +16,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Any, Optional import torch from torch import nn @@ -48,7 +49,7 @@ class Llama4MoE(nn.Module): gating_output: torch.Tensor, topk: int, renormalize: bool, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: router_scores, router_indices = fast_topk(gating_output, topk, dim=-1) # psuedo-standard is that the router scores are floats router_scores = torch.sigmoid(router_scores.float()) @@ -115,7 +116,7 @@ class Llama4Attention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -300,7 +301,7 @@ class Llama4DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -335,9 +336,9 @@ class Llama4Model(LlamaModel): self, name: str, loaded_weight: torch.Tensor, - params_dict: Dict[str, nn.Parameter], - loaded_params: Set[str], - expert_params_mapping: List[Tuple[str, str, int, str]], + params_dict: dict[str, nn.Parameter], + loaded_params: set[str], + expert_params_mapping: list[tuple[str, str, int, str]], fused: bool = True, ) -> bool: expert_param_loaded = False @@ -390,8 +391,8 @@ class Llama4Model(LlamaModel): expert_param_loaded = True return expert_param_loaded - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", 
"q"), @@ -412,7 +413,7 @@ class Llama4Model(LlamaModel): ckpt_up_proj_name="gate_up_proj", num_experts=1) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "experts.gate_up_proj" in name or "experts.down_proj" in name: fused_experts_params = True @@ -489,8 +490,8 @@ class Llama4ForCausalLM(LlamaForCausalLM): prefix=prefix, layer_type=layer_type) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -506,7 +507,7 @@ class Llama4ForCausalLM(LlamaForCausalLM): self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 4e51daa220e46..018ecc2a8c0f0 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -92,8 +92,8 @@ class LlamaModel(nn.Module): hidden_states = hidden_states + residual return hidden_states, hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -103,7 +103,7 @@ class LlamaModel(nn.Module): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, 
shard_id in stacked_params_mapping: if weight_name not in name: @@ -150,7 +150,7 @@ class EagleLlamaForCausalLM(LlamaForCausalLM): ) -> tuple[torch.Tensor, torch.Tensor]: return self.model(input_ids, positions, hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 9761c8389db21..2302d1352de64 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -56,7 +57,7 @@ class LlamaDecoderLayer(LlamaDecoderLayer): embeds: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states embeds = self.input_layernorm(embeds) @@ -140,8 +141,8 @@ class LlamaModel(nn.Module): hidden_states, hidden_prenorm = self.norm(hidden_states, residual) return hidden_states, hidden_prenorm - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -151,7 +152,7 @@ class LlamaModel(nn.Module): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if 'midlayer.' 
in name: name = name.replace('midlayer.', 'layers.0.') @@ -228,7 +229,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): # combine multiple auxiliary hidden states returned by eagle3 return self.model.fc(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6287fdb3300cd..95c1a0ca0b981 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,8 +2,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union, cast) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union, cast) import torch import torch.nn as nn @@ -751,8 +751,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c7e8d6991b25b..e731f1bfdb9ab 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, TypeVar, Union) +from collections.abc import Iterable, Mapping +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -266,8 
+267,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -450,7 +451,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_input( self, image_input: LlavaNextImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -577,7 +578,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a5ff189cfdb50..9303ea1217273 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -2,7 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -35,7 +35,7 @@ from .vision import get_vision_encoder_info class LlavaNextVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size, num_frames, num_channels, height, width)` @@ -300,8 +300,8 @@ class 
LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, self.language_model.model.make_empty_intermediate_tensors) def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -326,7 +326,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values_videos = kwargs.pop("pixel_values_videos", None) @@ -460,8 +460,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5c2b388e403df..49f1ecb4be897 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -2,8 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple, - TypedDict, Union) +from typing import Final, Literal, Optional, Protocol, TypedDict, Union import torch import torch.nn as nn @@ -471,8 +470,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return data def _validate_image_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, 
List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -530,8 +529,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, raise AssertionError("This line should be unreachable.") def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -557,7 +556,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values_videos = kwargs.pop("pixel_values_videos", None) @@ -706,7 +705,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_pixels( self, inputs: LlavaOnevisionImagePixelInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: assert self.vision_tower is not None pixel_values = inputs["pixel_values"] @@ -735,7 +734,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, def _process_image_input( self, image_input: LlavaOnevisionImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -948,7 +947,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: 
Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index af78ece66bbed..ce76a76b65743 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -30,7 +31,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class MambaDecoderLayer(nn.Module): @@ -153,10 +154,10 @@ class MambaModel(nn.Module): return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") @@ -247,7 +248,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape = ( self.config.intermediate_size // world_size, @@ -265,7 +266,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> 
set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 72daf34c44121..858a1633befa0 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -35,7 +36,7 @@ from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class Mamba2DecoderLayer(nn.Module): @@ -241,7 +242,7 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape, temporal_state_shape = None, None @@ -279,10 +280,10 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 25839727898fb..47d0ef9cc6bb1 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import 
dataclass -from typing import Tuple import torch @@ -25,8 +24,8 @@ class MambaCacheParams: class MambaCacheManager(ConstantSizeCache): def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, - num_mamba_layers: int, conv_state_shape: Tuple[int, int], - temporal_state_shape: Tuple[int, int]): + num_mamba_layers: int, conv_state_shape: tuple[int, int], + temporal_state_shape: tuple[int, int]): # Determine max batch size to set size of MambaCache max_batch_size = vllm_config.scheduler_config.max_num_seqs diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index a19d7da5654b6..ac0b281f359c3 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -96,13 +97,13 @@ class Medusa(nn.Module): # checkpoint file has token_map tensor. 
self.token_map = None - def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]: + def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]: return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: List[torch.Tensor], - sampling_metadata: SamplingMetadata) -> List[torch.Tensor]: - logits_lst: List[torch.Tensor] = [] + self, hidden_states: list[torch.Tensor], + sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: + logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): _logits = self.logits_processor(lm_head, hs, sampling_metadata) @@ -127,9 +128,9 @@ class Medusa(nn.Module): def sample( self, - logits: List[torch.Tensor], + logits: list[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: logits = torch.stack(logits, dim=0).float() logprobs = torch.log_softmax(logits, dim=-1) token_ids = logits.argmax(-1) # support only top-1 for now @@ -144,7 +145,7 @@ class Medusa(nn.Module): token_prob_list.append(probs[:, seq_group.sample_indices]) token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - outputs: List[Optional[SamplerOutput]] = [] + outputs: list[Optional[SamplerOutput]] = [] for idx in range(len(sampling_metadata.seq_groups)): outputs.append( SamplerOutput( @@ -160,7 +161,7 @@ class Medusa(nn.Module): self, previous_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: return self.sample( logits=self.compute_logits( hidden_states=self.forward(previous_hidden_states), @@ -169,10 +170,10 @@ class Medusa(nn.Module): sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = 
set() weights_map = {} diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index b882aeebb08dc..49ea64e029d63 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -24,7 +24,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -87,8 +88,8 @@ class MiMoModel(Qwen2Model): hidden_states = hidden_states + residual return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -97,7 +98,7 @@ class MiMoModel(Qwen2Model): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "mtp_layers" in name: continue diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index c2f1cf4112feb..adcfcaa6b1e6a 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only MiMo-MTP model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -193,8 +194,8 @@ class MiMoMTP(nn.Module): next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -204,7 +205,7 @@ class MiMoMTP(nn.Module): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 866dc3f466e79..d99ae81468a9b 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -23,7 +23,8 @@ # limitations under the License. 
"""Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -190,7 +191,7 @@ class MiniCPMAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -329,7 +330,7 @@ class MiniCPMDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -428,8 +429,8 @@ class MiniCPMModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -446,7 +447,7 @@ class MiniCPMModel(nn.Module): for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -582,8 +583,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/minicpm3.py 
b/vllm/model_executor/models/minicpm3.py index 1b24c38cef1b0..2a6867d12d993 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -23,7 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any, Dict, Optional +from typing import Any, Optional import torch from torch import nn @@ -58,7 +58,7 @@ class MiniCPM3Attention(nn.Module): q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index f42d48e919cd0..ae5df0f9273f6 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -23,8 +23,7 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -559,8 +558,8 @@ class MiniCPMO(MiniCPMV2_6): self.audio_encoder_layer = -1 return model - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 300360f785aec..04cc7e35e3450 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -26,8 +26,7 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -118,7 +117,7 @@ class Resampler2_5(BaseResampler): num_heads: int, kv_dim: Optional[int] = None, norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - max_size: Tuple[int, int] = (70, 70), + max_size: tuple[int, int] = (70, 70), quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: super().__init__(num_queries, @@ -133,7 +132,7 @@ class Resampler2_5(BaseResampler): self._set_2d_pos_cache(self.max_size) def _set_2d_pos_cache(self, - max_size: Tuple[int, int], + max_size: tuple[int, int], device: torch.types.Device = "cpu") -> None: pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, max_size, @@ -203,7 +202,7 @@ class Resampler2_5(BaseResampler): return x -def get_version_by_config(config: PretrainedConfig) -> 
Tuple[int, ...]: +def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) # The old configs do not include version number @@ -938,8 +937,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): ) -> Optional[torch.Tensor]: return self.llm.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 951f4e2304a1b..0285402dadf7f 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -3,7 +3,8 @@ import copy import math import re -from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.distributed @@ -127,7 +128,7 @@ class MiniMaxText01RMSNormTP(CustomOp): self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert residual is None, "RMSNorm does not support residual connection." 
return self._forward(x) @@ -178,7 +179,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp): positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device) query_cast = query.to(self.cache_dtype) @@ -708,11 +709,11 @@ class MiniMaxText01DecoderLayer(nn.Module): def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor, - kv_caches: Union[List[Dict], Optional[torch.Tensor]], + kv_caches: Union[list[dict], Optional[torch.Tensor]], attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], is_warmup: bool = False, - **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: + **kwargs) -> tuple[torch.Tensor, torch.Tensor]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata @@ -1072,10 +1073,10 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid, device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() def which_layer(name: str) -> int: if "layers" in name: diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 4ac60f97bb5f1..14c1250ca3b42 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping -from typing import Literal, Optional, Set, Tuple, TypedDict, Union, cast +from typing import Literal, Optional, TypedDict, Union, cast import torch import torch.nn as nn @@ -357,7 +357,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, return 
self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 42ec786f3a590..2b9cbf10440ab 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -2,8 +2,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -589,8 +589,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1513c8dad097b..1968bf9e68af3 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -314,8 +315,8 @@ class MixtralModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -332,7 +333,7 @@ class MixtralModel(nn.Module): num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -479,7 +480,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 7c022a5b8f689..4de83d12be6a2 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import numpy as np import torch @@ -397,8 +398,8 @@ class MixtralForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -407,7 +408,7 @@ class MixtralForCausalLM(nn.Module, SupportsPP): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 0c1d61c01f910..713c9e8d203fa 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -16,7 +16,7 @@ """PyTorch Mllama model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import numpy as np import torch @@ -224,7 +224,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] return mm_inputs - def _get_num_image_in_last_group(self, prompt_token_ids: List[int]) -> int: + def _get_num_image_in_last_group(self, prompt_token_ids: list[int]) -> int: num_images = 0 for token_id in prompt_token_ids[::-1]: if token_id == self.info.get_hf_config().image_token_index: @@ -370,8 +370,8 @@ class ColumnParallelConv2dPatch(torch.nn.Module): self, in_channels: int, out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]], + kernel_size: Union[int, tuple[int, int]], + stride: Union[int, tuple[int, int]], bias: bool = 
False, ) -> None: super().__init__() @@ -603,7 +603,7 @@ class MllamaVisionEncoder(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: encoder_states = () for i, encoder_layer in enumerate(self.layers): @@ -878,7 +878,7 @@ class MllamaTextCrossAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], cross_attention_states: Optional[torch.Tensor], ) -> torch.Tensor: q, k, v = self.qkv_proj(hidden_states, cross_attention_states) @@ -905,7 +905,7 @@ class MllamaTextCrossAttention(nn.Module): k: torch.Tensor, v: torch.Tensor, attention_mask: torch.Tensor, - kv_range_for_decode: List[Tuple[int, int]], + kv_range_for_decode: list[tuple[int, int]], ) -> torch.Tensor: kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank] attn_metadata: AttentionMetadata = get_forward_context().attn_metadata @@ -1019,7 +1019,7 @@ class MllamaCrossAttentionDecoderLayer(torch.nn.Module): hidden_states: torch.Tensor, cross_attention_states: torch.Tensor, cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], full_text_row_masked_out_mask: torch.Tensor, ) -> torch.Tensor: residual = hidden_states @@ -1089,8 +1089,8 @@ class MllamaTextModel(nn.Module): positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1150,8 +1150,8 @@ class MllamaForCausalLM(nn.Module):
positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1221,7 +1221,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, return logits def unpack_data(self, - image_data: Union[List[torch.Tensor], torch.Tensor], + image_data: Union[list[torch.Tensor], torch.Tensor], padding_value=0) -> torch.Tensor: if isinstance(image_data, torch.Tensor): # torch.Tensor @@ -1230,7 +1230,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, assert isinstance( image_data[0], torch.Tensor), "Image data is not properly batched." - # List[torch.Tensor] + # list[torch.Tensor] bsz = len(image_data) max_length = max(t.size(0) for t in image_data) trailing_dims = image_data[0].shape[1:] @@ -1248,24 +1248,24 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by # MultiModalKwargs.batch, so pixel_values here can be: - # - List[torch.Tensor]: + # - list[torch.Tensor]: # with shape (num_image, num_tiles, 3, image_res, image_res) # - torch.Tensor: # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) - aspect_ratio_ids: 
Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_mask", None) @@ -1293,10 +1293,10 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, def _get_and_validate_encoder_lens( self, - encoder_seq_lens: List[int], - num_tiles: List[List[int]], + encoder_seq_lens: list[int], + num_tiles: list[list[int]], num_tokens_per_tile: int, - ) -> List[int]: + ) -> list[int]: # Get the actual number of encoder tokens for each sample. # Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the @@ -1318,7 +1318,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, def flat_encoder_result(self, cross_attention_states: torch.Tensor, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int]): + actual_encoder_seq_lens: list[int]): cross_attention_states_flat = torch.zeros( sum(actual_encoder_seq_lens), @@ -1342,8 +1342,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, self, image_inputs: MllamaImagePixelInputs, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int], - ) -> Tuple[torch.Tensor]: + actual_encoder_seq_lens: list[int], + ) -> tuple[torch.Tensor]: # NOTE: llama's reference implementation runs vision model on CPU pixel_values = image_inputs['data'] aspect_ratio_ids = image_inputs['aspect_ratio_ids'] @@ -1367,10 +1367,10 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, self, input_ids: torch.Tensor, attn_metadata: AttentionMetadata, - num_tiles: List[List[int]], + num_tiles: list[list[int]], num_tokens_per_tile: int, dtype: torch.dtype, - ) -> 
Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: token_ids = input_ids.tolist() start = 0 batch_token_ids = [] @@ -1422,7 +1422,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, input_ids: torch.Tensor, positions: torch.Tensor, **kwargs: object, - ) -> Union[Tuple, CausalLMOutputWithPast]: + ) -> CausalLMOutputWithPast: attn_metadata = get_forward_context().attn_metadata if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: @@ -1476,8 +1476,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, return outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1487,7 +1487,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1538,7 +1538,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal, tower_model="vision_model") -def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: +def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: for mask in sparse_mask: # Skip text-only samples.
if len(mask) == 0: @@ -1556,10 +1556,10 @@ def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: def convert_sparse_cross_attention_mask_to_dense( - sparse_mask: List[List[List[int]]], - num_tiles: List[List[int]], - lengths: List[int], -) -> Tuple[np.ndarray, List[Tuple[int, int]]]: + sparse_mask: list[list[list[int]]], + num_tiles: list[list[int]], + lengths: list[int], +) -> tuple[np.ndarray, list[tuple[int, int]]]: total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 741b9837398c2..8c98492c0bedd 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -18,7 +18,7 @@ import math from collections.abc import Iterable, Mapping from itertools import tee -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -582,7 +582,7 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo] mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, - ) -> List[PromptUpdate]: + ) -> list[PromptUpdate]: assert ( mm_items.get_count("image", strict=False) == 0 or "aspect_ratios" in out_mm_kwargs @@ -778,26 +778,26 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, def separate_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], prefix: str, - ) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[ + ) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[ str, torch.Tensor]]]: weights1, weights2 = tee(weights, 2) - def get_prefix_weights() -> Iterable[Tuple[str, torch.Tensor]]: + def get_prefix_weights() -> Iterable[tuple[str, torch.Tensor]]: for name, data in 
weights1: if name.startswith(prefix): yield (name, data) - def get_other_weights() -> Iterable[Tuple[str, torch.Tensor]]: + def get_other_weights() -> Iterable[tuple[str, torch.Tensor]]: for name, data in weights2: if not name.startswith(prefix): yield (name, data) return get_prefix_weights(), get_other_weights() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -806,7 +806,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() # language_model is an Llama4ForCausalLM instance. We load it's # using llama4's load_weights routine. diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 2920427f94f7b..a7d7aa7d44ef2 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, List, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -148,7 +148,7 @@ class MLPSpeculator(nn.Module): previous_hidden_states: torch.Tensor, num_predict_tokens: int, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if num_predict_tokens > self.max_speculative_tokens: raise ValueError(f"Max speculative tokens for model is " f"{self.max_speculative_tokens}, but " @@ -190,10 +190,10 @@ class MLPSpeculator(nn.Module): return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - 
loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: name = name.replace("speculator.", "") param = params_dict.get(name) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 73effb207bcef..86552aa05bf95 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -212,11 +213,11 @@ class ModernBertModel(nn.Module): eps=config.norm_eps, bias=config.norm_bias) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: continue @@ -280,7 +281,7 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): self._pooler = CrossEncodingPooler(config, self.classifier, ModernBertPooler(config)) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 23814e6322d2e..25e6f594069ef 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -4,7 +4,7 @@ # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field -from typing import List, Union +from typing import Union @dataclass @@ -46,17 +46,17 @@ class ModelKeys: @dataclass class 
MultiModelKeys(ModelKeys): - language_model: List[str] = field(default_factory=list) - connector: List[str] = field(default_factory=list) + language_model: list[str] = field(default_factory=list) + connector: list[str] = field(default_factory=list) # vision tower and audio tower - tower_model: List[str] = field(default_factory=list) - generator: List[str] = field(default_factory=list) + tower_model: list[str] = field(default_factory=list) + generator: list[str] = field(default_factory=list) @staticmethod - def from_string_field(language_model: Union[str, List[str]] = None, - connector: Union[str, List[str]] = None, - tower_model: Union[str, List[str]] = None, - generator: Union[str, List[str]] = None, + def from_string_field(language_model: Union[str, list[str]] = None, + connector: Union[str, list[str]] = None, + tower_model: Union[str, list[str]] = None, + generator: Union[str, list[str]] = None, **kwargs) -> 'MultiModelKeys': def to_list(value): diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 42bbb77a22c07..e215582a37ac8 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -4,7 +4,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass from functools import cached_property, partial -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import numpy as np import torch @@ -90,7 +90,7 @@ class MolmoImageInputs(TypedDict): @dataclass class VisionBackboneConfig: - image_default_input_size: Tuple[int, int] = (336, 336) + image_default_input_size: tuple[int, int] = (336, 336) image_patch_size: int = 14 image_pos_patch_size: int = 14 image_emb_dim: int = 1024 @@ -267,7 +267,7 @@ class BlockCollection(nn.Module): for _ in range(config.image_num_layers) ]) - def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: 
hidden_states = [] for r in self.resblocks: x = r(x) @@ -334,7 +334,7 @@ class VisionTransformer(nn.Module): def forward(self, x: torch.Tensor, - patch_num: Optional[int] = None) -> List[torch.Tensor]: + patch_num: Optional[int] = None) -> list[torch.Tensor]: """ : param x: (batch_size, num_patch, n_pixels) """ @@ -434,7 +434,7 @@ class MolmoAttention(nn.Module): ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -570,7 +570,7 @@ class MolmoDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention if residual is None: residual = hidden_states @@ -596,7 +596,7 @@ class MolmoDecoderNormAfterLayer(MolmoDecoderLayer): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention residual = hidden_states hidden_states = self.self_attn( @@ -740,15 +740,15 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant): # image_features: (batch_size, num_image, num_patch, d_model) return image_features - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("merged_linear", "gate_proj", 0), ("merged_linear", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for 
(param_name, weight_name, shard_id) in stacked_params_mapping: @@ -855,10 +855,10 @@ class MolmoModel(nn.Module, SupportsQuant): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: @@ -1530,7 +1530,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) weights = _get_weights_with_merged_embedding(weights) @@ -1548,8 +1548,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, def _get_weights_with_merged_embedding( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: embedding_weights = {} for name, weight in weights: if "wte.embedding" in name: diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index c367d90f847b6..9f11d4a422733 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -42,9 +42,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import math +from collections.abc import Sequence from copy import deepcopy from functools import cached_property -from typing import List, Optional, Sequence, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -222,7 +223,7 @@ class MoonVisionPatchEmbed(nn.Module): self, out_dim: int, in_dim: int = 3, - patch_size: Union[int, Tuple[int, int]] = (14, 14), + patch_size: Union[int, tuple[int, int]] = (14, 14), pos_emb_height: int = 14, pos_emb_width: int = 14, ): @@ -526,7 +527,7 @@ def patch_merger( x: torch.Tensor, grid_hw: torch.Tensor, merge_kernel_size: list[int, int] = (2, 2), -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: d_model = x.size(-1) outputs = [] diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 77bd794058cda..6c396d778ae71 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -2,7 +2,8 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -265,10 +266,10 @@ class MPTModel(nn.Module): hidden_states = self.norm_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -323,7 +324,7 @@ class MPTForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 5208c0796c8d2..862c53535e8a8 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Nemotron model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -69,7 +70,7 @@ def _cast_if_autocast_enabled(*args): class NemotronLayerNorm1P(nn.LayerNorm): def __init__(self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True, @@ -133,7 +134,7 @@ class NemotronAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -267,7 +268,7 @@ class NemotronDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -441,8 +442,8 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): 
sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -450,7 +451,7 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 988b994b76896..f4d5a77f2086d 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only deci model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Type, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ class DeciLMDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if self._is_no_op_attention: @@ -168,7 +169,7 @@ class DeciModel(nn.Module): *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[DeciLMDecoderLayer] = DeciLMDecoderLayer, + layer_type: type[DeciLMDecoderLayer] = DeciLMDecoderLayer, ): super().__init__() @@ -260,8 +261,8 @@ class DeciModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + 
torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -271,7 +272,7 @@ class DeciModel(nn.Module): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -428,8 +429,8 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 0781ca168f840..a36b62cd2284c 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -209,7 +210,7 @@ class OlmoDecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Attention block. 
residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -338,8 +339,8 @@ class OlmoForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -349,7 +350,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 422b53d86f119..a41a959cdb044 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -23,8 +23,9 @@ # limitations under the License. 
"""Inference-only OLMo2 model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ class Olmo2Attention(nn.Module): ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -365,7 +366,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index e6925e1256909..9a07f57fd999c 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only OLMoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ class OlmoeAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -307,8 +308,8 @@ class OlmoeModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -327,7 +328,7 @@ class OlmoeModel(nn.Module): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). 
@@ -439,8 +440,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["rotary_emb.inv_freq"], diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index d258eddae25d4..8376d62410d4b 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OPT model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -312,8 +313,8 @@ class OPTModel(nn.Module): intermediate_tensors, inputs_embeds=inputs_embeds) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -321,7 +322,7 @@ class OPTModel(nn.Module): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -400,8 +401,8 @@ class OPTForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head.weight"] diff --git a/vllm/model_executor/models/orion.py 
b/vllm/model_executor/models/orion.py index 8d9c000750d78..1ccd1fe1f741d 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -5,7 +5,8 @@ # Copyright (c) OrionStar Inc. # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -72,7 +73,7 @@ class OrionAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -186,7 +187,7 @@ class OrionDecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -259,8 +260,8 @@ class OrionModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -270,7 +271,7 @@ class OrionModel(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -341,8 +342,8 @@ class OrionForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def 
load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=([ diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 5204c751216f8..e03705d48f3e8 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -17,8 +17,8 @@ # limitations under the License. """ PyTorch Ovis model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -211,7 +211,7 @@ class OvisImagePatchInputs(TypedDict): `(batch_size * (num_patches + 1))` """ - patches_per_image: List[int] + patches_per_image: list[int] """ List of number of total patches for each image in the batch. This is used to restore the first two dimensions of `flat_data`. 
@@ -545,8 +545,8 @@ class Ovis(nn.Module, SupportsMultiModal): logits = self.llm.compute_logits(hidden_states, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8699ae52622d5..427005e9b7041 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -391,7 +391,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index eacf02433b573..d46b95fea5a8a 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only persimmon model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -260,10 +261,10 @@ class PersimmonModel(nn.Module): hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if is_pp_missing_parameter(name, self): continue @@ -336,7 +337,7 @@ class PersimmonForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index fc2b108bad97b..330ad5c59448b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -36,7 +36,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"""Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -248,8 +249,8 @@ class PhiModel(nn.Module): return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -257,7 +258,7 @@ class PhiModel(nn.Module): ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -348,7 +349,7 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata, self.lm_head.bias) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 338e87b4285fb..d00d7d886d671 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -230,8 +231,8 @@ class Phi3SmallSelfAttention(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], + Optional[tuple[torch.Tensor]]]: qkv, _ = self.query_key_value(hidden_states) 
qkv = qkv.view(qkv.shape[:-1] + @@ -352,10 +353,10 @@ class Phi3SmallModel(nn.Module): hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: continue @@ -454,8 +455,8 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP): output_hidden_states = output_hidden_states return output_hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head.weight"] diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a1442251b9928..bb4d46be3f997 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,7 @@ # limitations under the License. 
import re from collections.abc import Iterable, Mapping, Sequence -from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -94,7 +94,7 @@ def _init_img_processor(hf_config: PretrainedConfig, class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -113,7 +113,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -571,8 +571,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size expected_dims = (3, h, w) @@ -707,8 +707,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index e5ff9ceddef72..fd154940ea7f4 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,7 +1,7 @@ # 
SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -392,7 +392,7 @@ class Phi4MMImageEncoder(nn.Module): class Phi4MMImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -417,7 +417,7 @@ class Phi4MMImagePixelInputs(TypedDict): class Phi4MMImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -426,7 +426,7 @@ class Phi4MMImageEmbeddingInputs(TypedDict): class Phi4MMAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_audios, 80, M)""" @@ -1031,7 +1031,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): return audio_embeds def _parse_and_validate_image_input(self, - **kwargs: object) -> Optional[Dict]: + **kwargs: object) -> Optional[dict]: input_image_embeds: NestedTensors = kwargs.get("input_image_embeds") if input_image_embeds is None: return None @@ -1238,7 +1238,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: weights = ((name, data) for name, data in weights if "lora" not in name) diff --git a/vllm/model_executor/models/phi4mm_audio.py 
b/vllm/model_executor/models/phi4mm_audio.py index 34a7a73d057ae..609746b48588c 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -6,7 +6,7 @@ #!/usr/bin/env python3 import abc import math -from typing import List, Literal, Optional +from typing import Literal, Optional import numpy as np import torch @@ -746,7 +746,7 @@ class ConformerEncoder(TransformerEncoderBase): attention_group_size = attenion_heads = Multi-Query Attention """ - extra_multi_layer_output_idxs: List[int] + extra_multi_layer_output_idxs: list[int] def __init__( # pylint: disable-all self, diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index 4051763cec8ca..f468fdbd5417f 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -5,7 +5,7 @@ # but implemented by the Phi-Speech team #!/usr/bin/env python3 import math -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import torch.nn.functional as F @@ -1586,7 +1586,7 @@ class AttModule(nn.Module): memory: Optional[Tensor] = None, pos_emb: Optional[Tensor] = None, att_mask: Optional[Tensor] = None, - ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: + ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: """AttModule forward Args: diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 2dc55e4c352e3..7f2e9fdf7c4ef 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only PhiMoE model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -505,8 +506,8 @@ class PhiMoEModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -521,7 +522,7 @@ class PhiMoEModel(nn.Module): num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -657,8 +658,8 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index c0b492dbfcb9d..c664d2371e27c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -4,7 +4,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, fields from functools import cached_property -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -438,18 +438,18 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def 
load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_encoder") - def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_lang_adapter_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_language_adapter") - def is_patch_merger(weight: Tuple[str, torch.Tensor]): + def is_patch_merger(weight: tuple[str, torch.Tensor]): return weight[0].startswith("patch_merger") - def is_pre_mm_projector_norm(weight: Tuple[str, torch.Tensor]): + def is_pre_mm_projector_norm(weight: tuple[str, torch.Tensor]): return weight[0].startswith("pre_mm_projector_norm") # Get references to parameters for direct loading @@ -566,7 +566,7 @@ def apply_rotary_emb_vit( xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) assert freqs_cis.dtype == torch.complex64 @@ -671,7 +671,7 @@ class Transformer(nn.Module): return x -def position_meshgrid(patch_embeds_list: List[torch.Tensor], ) -> torch.Tensor: +def position_meshgrid(patch_embeds_list: list[torch.Tensor], ) -> torch.Tensor: positions = torch.cat([ torch.stack( torch.meshgrid( @@ -733,7 +733,7 @@ class VisionTransformer(nn.Module): def forward( self, - images: List[torch.Tensor], + images: list[torch.Tensor], ) -> torch.Tensor: """ Args: @@ -1023,7 +1023,7 @@ class PixtralHFAttention(nn.Module): hidden_states: torch.Tensor, attention_mask: torch.Tensor, position_embeddings: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = 
hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) @@ -1249,8 +1249,8 @@ class PixtralHFVisionModel(nn.Module): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1260,7 +1260,7 @@ class PixtralHFVisionModel(nn.Module): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 790c48ccd2166..55a65f8078a4d 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only PLaMo2 model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -659,7 +660,7 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, IsHybrid, return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = (self.config.mamba_num_heads * self.config.hidden_size_per_head) @@ -682,7 +683,7 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, IsHybrid, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): params_dict = 
dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index c10ef45440b11..40ac5e30a368b 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -16,7 +16,7 @@ # limitations under the License. """Inference-only IBM/NASA Prithvi Geospatial model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Optional, Set, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -154,7 +154,7 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal, "by PrithviGeospatialMAE.") def _parse_and_validate_multimodal_data( - self, **kwargs) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: pixel_values = kwargs.pop("pixel_values", None) if not isinstance(pixel_values, torch.Tensor): @@ -195,8 +195,8 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal, ) -> Optional[PoolerOutput]: return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)]) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_list = [] model_buffers = dict(self.named_buffers()) loaded_buffers = [] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index e75294bc6cba8..2fda87a4ff0f6 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,7 +6,8 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" import json -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from 
torch import nn @@ -76,7 +77,7 @@ class QWenAttention(nn.Module): num_heads: int, max_position_embeddings: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -166,7 +167,7 @@ class QWenBlock(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -284,15 +285,15 @@ class QWenBaseModel(nn.Module): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 60f8a7cd7270a..108d002e601b9 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Any, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -108,7 +109,7 @@ class Qwen2Attention(nn.Module): rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: Optional[dict[str, Any]] = None, @@ -245,7 +246,7 @@ class Qwen2DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -367,8 +368,8 @@ class Qwen2Model(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -378,7 +379,7 @@ class Qwen2Model(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -490,8 +491,8 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -559,7 +560,7 @@ class Qwen2EmbeddingModel(nn.Module, 
SupportsLoRA, SupportsPP): ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index d8e178f9cd473..d89b822dd8739 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -21,10 +21,10 @@ # limitations under the License. """Inference-only Qwen2.5-Omni model (thinker part).""" +from collections.abc import Iterable, Mapping, Sequence from copy import copy from functools import partial -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, - Set, Tuple, Union) +from typing import Any, Optional, Union import torch import torch.nn as nn @@ -138,7 +138,7 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5OmniProcessor: if fps is not None: @@ -550,7 +550,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _parse_and_validate_image_input( self, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> Optional[Qwen2_5_VLImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) @@ -589,7 +589,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _parse_and_validate_video_input( self, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> Optional[Qwen2_5_VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) video_embeds = 
kwargs.pop("video_embeds", None) @@ -627,7 +627,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _process_audio_input( self, audio_input: Qwen2AudioInputs, - audio_hashes: List[str] = None, + audio_hashes: list[str] = None, cached_audio_features: torch.Tensor = None, ) -> torch.Tensor: @@ -676,7 +676,7 @@ class Qwen2_5OmniConditionalGenerationMixin: def _process_video_input( self, video_input: Qwen2_5_VLVideoInputs, - video_hashes: List[str] = None, + video_hashes: list[str] = None, cached_video_embeds: torch.Tensor = None) -> torch.Tensor: if video_input["type"] == "video_embeds": return video_input["video_embeds"].type(self.visual.dtype) @@ -825,7 +825,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration( if audio_input is None and image_input is None and video_input is None: return None - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + multimodal_embeddings: list[tuple[NestedTensors, str]] = [] if audio_input is not None: audio_embeds = self._process_audio_input(audio_input) @@ -891,8 +891,8 @@ class Qwen2_5OmniThinkerForConditionalGeneration( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["talker.", "token2wav."], diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8728de95134d7..5904ad1f1f247 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,9 +24,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import partial -from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -91,7 +91,7 @@ class Qwen2_5_VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). @@ -137,7 +137,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). 
@@ -709,8 +709,8 @@ class Qwen2_5_VisionTransformer(nn.Module): hidden_states = hidden_states[reverse_indices, :] return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("attn.qkv.", "attn.q.", "q"), @@ -718,7 +718,7 @@ class Qwen2_5_VisionTransformer(nn.Module): ("attn.qkv.", "attn.v.", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -750,7 +750,7 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5_VLProcessor: if fps is not None: @@ -1116,8 +1116,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f30bf08ab18bf..3182a75325787 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -22,7 +22,7 @@ # limitations under the License. 
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Optional, TypedDict, Union import torch import torch.nn as nn @@ -403,7 +403,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index ae1c146cf3f2c..7cf98dc7a4ea3 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -169,12 +170,12 @@ class Qwen2MoeAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - dual_chunk_attention_config: Optional[Dict[str, Any]] = None, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -389,8 +390,8 @@ class Qwen2MoeModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -409,7 +410,7 @@ class Qwen2MoeModel(nn.Module): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). 
@@ -532,8 +533,8 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 90f799e6734ed..81dc38988c9d9 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -5,7 +5,8 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -95,8 +96,8 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP, ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ac0a6de523dfb..0ff0836b08975 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,8 +25,7 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -102,7 +101,7 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): 
type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). @@ -142,7 +141,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -662,8 +661,8 @@ class Qwen2VisionTransformer(nn.Module): return x - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -671,7 +670,7 @@ class Qwen2VisionTransformer(nn.Module): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -1394,8 +1393,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 
40e0ccc1bab6b..dbe2be8a73d59 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen3 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -63,7 +64,7 @@ class Qwen3Attention(nn.Module): rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -201,7 +202,7 @@ class Qwen3DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -309,8 +310,8 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 1fef37a96ea93..aae5401721df1 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen3MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -149,7 +150,7 @@ class Qwen3MoeAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, head_dim: Optional[int] = None, rms_norm_eps: float = 1e-06, @@ -373,8 +374,8 @@ class Qwen3MoeModel(nn.Module): hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -393,7 +394,7 @@ class Qwen3MoeModel(nn.Module): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). 
@@ -527,8 +528,8 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 199b885a58506..3701153bace53 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -9,10 +9,9 @@ import copy import math import re import unicodedata -from collections.abc import Collection, Mapping, Sequence -from collections.abc import Set as AbstractSet +from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial -from typing import Callable, List, Literal, Optional, TypedDict, Union +from typing import Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -395,7 +394,7 @@ def _get_tokenizer_without_image_pad( def tokenize( self, text: str, - allowed_special: Union[AbstractSet[str], str] = "all", + allowed_special: Union[Set[str], str] = "all", disallowed_special: Union[Collection[str], str] = (), **kwargs, ) -> list[Union[bytes, str]]: @@ -411,7 +410,7 @@ def _get_tokenizer_without_image_pad( def _decode( self, - token_ids: Union[int, List[int]], + token_ids: Union[int, list[int]], skip_special_tokens: bool = False, errors: Optional[str] = None, **kwargs, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 06a0e65746300..c55f7ccd344ff 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -10,10 +10,10 @@ import subprocess import sys import tempfile from abc import ABC, abstractmethod +from collections.abc import Set from dataclasses import dataclass, field from functools import lru_cache -from typing import 
(AbstractSet, Callable, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import Callable, Optional, TypeVar, Union import cloudpickle import torch.nn as nn @@ -266,7 +266,7 @@ class _ModelInfo: supports_v0_only: bool @staticmethod - def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), @@ -290,7 +290,7 @@ class _BaseRegisteredModel(ABC): raise NotImplementedError @abstractmethod - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: raise NotImplementedError @@ -301,10 +301,10 @@ class _RegisteredModel(_BaseRegisteredModel): """ interfaces: _ModelInfo - model_cls: Type[nn.Module] + model_cls: type[nn.Module] @staticmethod - def from_model_cls(model_cls: Type[nn.Module]): + def from_model_cls(model_cls: type[nn.Module]): return _RegisteredModel( interfaces=_ModelInfo.from_model_cls(model_cls), model_cls=model_cls, @@ -313,7 +313,7 @@ class _RegisteredModel(_BaseRegisteredModel): def inspect_model_cls(self) -> _ModelInfo: return self.interfaces - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: return self.model_cls @@ -330,7 +330,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel): return _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name) return getattr(mod, self.class_name) @@ -339,7 +339,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel): def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, -) -> Optional[Type[nn.Module]]: +) -> Optional[type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) try: @@ -366,15 +366,15 @@ def _try_inspect_model_cls( 
@dataclass class _ModelRegistry: # Keyed by model_arch - models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) + models: dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> AbstractSet[str]: + def get_supported_archs(self) -> Set[str]: return self.models.keys() def register_model( self, model_arch: str, - model_cls: Union[Type[nn.Module], str], + model_cls: Union[type[nn.Module], str], ) -> None: """ Register an external model to be used in vLLM. @@ -413,7 +413,7 @@ class _ModelRegistry: self.models[model_arch] = model - def _raise_for_unsupported(self, architectures: List[str]): + def _raise_for_unsupported(self, architectures: list[str]): all_supported_archs = self.get_supported_archs() if any(arch in all_supported_archs for arch in architectures): @@ -426,7 +426,7 @@ class _ModelRegistry: f"Supported architectures: {all_supported_archs}") def _try_load_model_cls(self, - model_arch: str) -> Optional[Type[nn.Module]]: + model_arch: str) -> Optional[type[nn.Module]]: if model_arch not in self.models: return None @@ -440,8 +440,8 @@ class _ModelRegistry: def _normalize_archs( self, - architectures: Union[str, List[str]], - ) -> List[str]: + architectures: Union[str, list[str]], + ) -> list[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: @@ -458,8 +458,8 @@ class _ModelRegistry: def inspect_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[_ModelInfo, str]: + architectures: Union[str, list[str]], + ) -> tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -471,8 +471,8 @@ class _ModelRegistry: def resolve_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[Type[nn.Module], str]: + architectures: Union[str, list[str]], + ) -> tuple[type[nn.Module], str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -484,77 +484,77 @@ class 
_ModelRegistry: def is_text_generation_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model def is_pooling_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_pooling_model def is_cross_encoder_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_cross_encoding def is_multimodal_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_multimodal def is_pp_supported_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_pp def model_has_inner_state( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_inner_state def is_attention_free_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free def is_hybrid_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_hybrid def is_noops_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_noops def is_transcription_model( self, - architectures: Union[str, List[str]], + architectures: 
Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_transcription def is_v1_compatible( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return not model_cls.supports_v0_only diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ebefe7689c974..9a4d0ab2dd4d7 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import Iterable, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel): prefix=prefix, embedding_class=RobertaEmbedding) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) # Separate weights in "roberta"-prefixed and all else (not in memory). # For use with models like FacebookAI/roberta-base. 
@@ -187,7 +188,7 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, self.classifier = RobertaClassificationHead(config) self._pooler = CrossEncodingPooler(config, self.classifier) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) bert_weights = self.jina_to_vllm_mapper.apply(bert_weights) @@ -249,8 +250,8 @@ def create_position_ids_from_input_ids(input_ids, def roberta_task_weights_filter( - all_weights: Iterable[Tuple[str, torch.Tensor]] -) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, + all_weights: Iterable[tuple[str, torch.Tensor]] +) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[str, torch.Tensor]]]: """ Separate task-specific weights that are applied on top diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 75fcf540b0b12..3b5334afa7af8 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -3,7 +3,8 @@ within a vision language model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -265,7 +266,7 @@ class SiglipEncoderLayer(nn.Module): def forward( self, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, None]: + ) -> tuple[torch.Tensor, None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -480,8 +481,8 @@ class SiglipVisionModel(nn.Module): feature_sample_layers=feature_sample_layers, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -489,7 +490,7 @@ class 
SiglipVisionModel(nn.Module): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index e78c37b65f874..91f6c7753c68b 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -8,7 +8,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -937,8 +937,8 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = [ "action_embed", "temporal_embed", "track_embed", "track_embed_decoder", "box_token", "cg_criterion", "cg_model", diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 17217dc9a2470..31dec55026bae 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Optional +from typing import Optional from transformers import SmolVLMProcessor @@ -21,7 +21,7 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo): def get_hf_processor( self, *, - max_image_size: Optional[Dict[str, int]] = None, + max_image_size: Optional[dict[str, int]] = None, **kwargs: object, ) -> SmolVLMProcessor: if max_image_size is not None: diff --git 
a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index f86aff7ba7ef0..1c9f3c77c7a80 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -23,7 +23,8 @@ # limitations under the License. """Inference-only Solar model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -101,7 +102,7 @@ class SolarAttention(nn.Module): num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -236,7 +237,7 @@ class SolarDecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -437,8 +438,8 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -448,7 +449,7 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 1cbda7267e4c6..8c2ad6f192515 100644 --- a/vllm/model_executor/models/stablelm.py +++ 
b/vllm/model_executor/models/stablelm.py @@ -20,7 +20,8 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -180,7 +181,7 @@ class StablelmDecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -252,8 +253,8 @@ class StableLMEpochModel(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -263,7 +264,7 @@ class StableLMEpochModel(nn.Module): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -335,8 +336,8 @@ class StablelmForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 6eebe4c4d6145..5927afa91f491 100644 --- a/vllm/model_executor/models/starcoder2.py +++ 
b/vllm/model_executor/models/starcoder2.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Starcoder2 model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -255,8 +256,8 @@ class Starcoder2Model(nn.Module): hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -265,7 +266,7 @@ class Starcoder2Model(nn.Module): ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -342,8 +343,8 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 379e19e1beea1..7d713d23c772d 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -19,7 +19,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -50,14 +50,14 @@ class TeleChat2Model(LlamaModel): layer.mlp.gate_up_proj.bias = None layer.mlp.gate_up_proj.skip_bias_add = True - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ('gate_up_proj', 'gate_proj', 0), ('gate_up_proj', 'up_proj', 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() total_num_heads = self.config.n_head head_dim = self.config.hidden_size // total_num_heads for name, loaded_weight in weights: @@ -128,8 +128,8 @@ class TeleChat2ForCausalLM(LlamaForCausalLM): layer_type: type[nn.Module] = LlamaDecoderLayer): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 7b946ad6aac7d..a8f30b2f27bfe 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,7 +15,8 @@ # limitations under the License. 
"""Wrapper around `transformers` models""" import re -from typing import Iterable, Literal, Optional, Union +from collections.abc import Iterable +from typing import Literal, Optional, Union import torch from torch import nn diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 0bc5d218f8d06..c1a4dc1b33d78 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,7 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -619,8 +619,8 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0458e3ce03b5d..5cc501622891f 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import itertools +from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, Union, overload) +from typing import Callable, Literal, Optional, Protocol, Union, overload import torch import torch.nn as nn @@ -58,8 +58,8 @@ class WeightsMapper: return key def apply( - self, weights: Iterable[Tuple[str, torch.Tensor]] - ) -> 
Iterable[Tuple[str, torch.Tensor]]: + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, torch.Tensor]]: return ((out_name, data) for name, data in weights if (out_name := self._map_name(name)) is not None) @@ -84,8 +84,8 @@ class AutoWeightsLoader: self, module: nn.Module, *, - skip_prefixes: Optional[List[str]] = None, - ignore_unexpected_prefixes: Optional[List[str]] = None, + skip_prefixes: Optional[list[str]] = None, + ignore_unexpected_prefixes: Optional[list[str]] = None, ) -> None: super().__init__() @@ -95,8 +95,8 @@ class AutoWeightsLoader: def _groupby_prefix( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Iterable[Tuple[str, Iterable[Tuple[str, torch.Tensor]]]]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> Iterable[tuple[str, Iterable[tuple[str, torch.Tensor]]]]: weights_by_parts = ((weight_name.split(".", 1), weight_data) for weight_name, weight_data in weights) @@ -129,7 +129,7 @@ class AutoWeightsLoader: self, base_prefix: str, param: nn.Parameter, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: for weight_name, weight_data in weights: weight_qualname = self._get_qualname(base_prefix, weight_name) @@ -159,7 +159,7 @@ class AutoWeightsLoader: yield weight_qualname def _add_loadable_non_param_tensors(self, module: nn.Module, - child_params: Dict[str, torch.Tensor]): + child_params: dict[str, torch.Tensor]): """ Add tensor names that are not in the model params that may be in the safetensors, e.g., batch normalization stats. 
@@ -182,7 +182,7 @@ class AutoWeightsLoader: self, base_prefix: str, module: nn.Module, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: if isinstance(module, PPMissingLayer): return @@ -251,10 +251,10 @@ class AutoWeightsLoader: def load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> Set[str]: + ) -> set[str]: if mapper is not None: weights = mapper.apply(weights) @@ -292,13 +292,13 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor: @overload -def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: +def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: ... @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: Literal[True], ) -> torch.Tensor: @@ -307,18 +307,18 @@ def flatten_bn( @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: ... def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. 
@@ -442,7 +442,7 @@ def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_token_id: Union[int, List[int]], + placeholder_token_id: Union[int, list[int]], ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the @@ -596,7 +596,7 @@ def make_layers( num_hidden_layers: int, layer_fn: LayerFn, prefix: str, -) -> Tuple[int, int, torch.nn.ModuleList]: +) -> tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. """ @@ -614,10 +614,10 @@ def make_layers( # NOTE: don't use lru_cache here because it can prevent garbage collection -_model_to_pp_missing_layer_names: Dict[int, List[str]] = {} +_model_to_pp_missing_layer_names: dict[int, list[str]] = {} -def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]: +def get_pp_missing_layer_names(model: torch.nn.Module) -> list[str]: """Get the names of the missing layers in a pipeline parallel model.""" model_id = id(model) if model_id in _model_to_pp_missing_layer_names: @@ -645,7 +645,7 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool: for missing_layer_name in get_pp_missing_layer_names(model)) -def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): +def make_empty_intermediate_tensors_factory(keys: list[str], hidden_size: int): def make_empty_intermediate_tensors( batch_size: int, @@ -684,7 +684,7 @@ def extract_layer_index(layer_name: str) -> int: - "model.encoder.layers.0.sub.1" -> ValueError """ subnames = layer_name.split(".") - int_vals: List[int] = [] + int_vals: list[int] = [] for subname in subnames: try: int_vals.append(int(subname)) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 908cd7885aa83..c6e303d6024a4 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ 
-2,7 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import torch from torch import nn @@ -382,7 +382,7 @@ class WhisperEncoder(nn.Module): self.embed_positions.weight.copy_( sinusoids(*self.embed_positions.weight.shape)) - def forward(self, input_features: Union[torch.Tensor, List[torch.Tensor]]): + def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]): hidden_states = [] for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) @@ -460,7 +460,7 @@ class WhisperModel(nn.Module): def forward( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], input_ids: Optional[torch.Tensor], positions: torch.Tensor, ) -> torch.Tensor: @@ -474,14 +474,14 @@ class WhisperModel(nn.Module): def get_encoder_outputs( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], ) -> Optional[torch.Tensor]: if input_features is None: return None return self.encoder(input_features) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), @@ -491,7 +491,7 @@ class WhisperModel(nn.Module): (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -722,8 +722,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, sampling_metadata) return logits - def 
load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) # add fake zeros bias for k_proj to state_dict @@ -732,8 +732,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, def _create_fake_bias_for_k_proj( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: """ Create full zeros bias for k_proj weight in self-attn and x-attn layers. So that the bias for k_proj in qkv_proj can be initialized with zeros. diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index eddccbba5a2db..48e254bdd85bd 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -6,8 +6,9 @@ https://arxiv.org/abs/2411.15242, which combines Mamba and Transformer architectures in a hybrid model optimized for efficient sequence modeling. The model alternates between state space model layers and attention-based layers. """ +from collections.abc import Iterable from itertools import cycle -from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -54,7 +55,7 @@ class Zamba2LoRA(nn.Module): self, input_dim: int, rank: int, - output_dim: Union[int, List[int]], + output_dim: Union[int, list[int]], quant_config: Optional[QuantizationConfig] = None, ): """Initialize the attention layer. @@ -279,7 +280,7 @@ class Zamba2MLP(nn.Module): self, config: Zamba2Config, bare_block_idx: int, - num_hybrid_layers: Dict[int, int], + num_hybrid_layers: dict[int, int], quant_config: Optional[QuantizationConfig] = None, ) -> None: """Initialize the MLP layer. 
@@ -769,8 +770,8 @@ class Zamba2Model(nn.Module): hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -779,7 +780,7 @@ class Zamba2Model(nn.Module): ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for chkpt_weight_name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in chkpt_weight_name: @@ -914,9 +915,9 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers: Dict[str, + def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str, torch.Tensor], - **kwargs) -> Dict[str, torch.Tensor]: + **kwargs) -> dict[str, torch.Tensor]: """Copy inputs before CUDA graph capture. Args: @@ -930,7 +931,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): input_buffers, **kwargs) def get_seqlen_agnostic_capture_inputs( - self, batch_size: int) -> Dict[str, torch.Tensor]: + self, batch_size: int) -> dict[str, torch.Tensor]: """Get inputs for sequence-length-agnostic graph capture. Args: @@ -941,7 +942,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: """Calculate shapes for Mamba's convolutional and state caches. 
Returns: @@ -1001,7 +1002,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only): sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)