mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-01 05:47:03 +08:00
Update deprecated type hinting in models (#18132)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
83f74c698f
commit
26d0419309
@ -77,7 +77,6 @@ exclude = [
|
||||
"vllm/engine/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/executor/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/model_executor/models/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
|
||||
"vllm/worker/**/*.py" = ["UP006", "UP035"]
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Inference-only Snowflake Arctic model."""
|
||||
from typing import Iterable, List, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -458,8 +459,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -467,8 +468,8 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
|
||||
mlp_params_mapping: List[Tuple[str, str, int]] = []
|
||||
expert_params_mapping: List[Tuple[str, str, int]] = []
|
||||
mlp_params_mapping: list[tuple[str, str, int]] = []
|
||||
expert_params_mapping: list[tuple[str, str, int]] = []
|
||||
num_layers = self.config.num_hidden_layers
|
||||
|
||||
for layer in range(num_layers):
|
||||
@ -497,7 +498,7 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
|
||||
("ws", f"experts.{expert_id}.w3.weight", expert_id))
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
logger.info(
|
||||
"It will take ~10 minutes loading from the 16-bit weights. "
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import List, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -66,8 +66,8 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
|
||||
# Identity layer
|
||||
self.post_layernorm = nn.Identity()
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -75,7 +75,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
# NOTE: post_layernorm is not used in Aria
|
||||
@ -326,8 +326,8 @@ class AriaTextModel(LlamaModel, SupportsQuant):
|
||||
|
||||
# Adapted from LlamaModel.load_weights with the modification of adding
|
||||
# the expert weights mapping to `stacked_params_mapping`
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -339,7 +339,7 @@ class AriaTextModel(LlamaModel, SupportsQuant):
|
||||
("experts.w2_weight", "experts.fc2.weight", 'w2'),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -528,7 +528,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
self.vocab_size, logit_scale)
|
||||
|
||||
def _validate_image_sizes(
|
||||
self, images: List[torch.Tensor]) -> List[torch.Tensor]:
|
||||
self, images: list[torch.Tensor]) -> list[torch.Tensor]:
|
||||
if not all(img.shape == images[0].shape for img in images):
|
||||
raise ValueError("All images must be the same size")
|
||||
return images
|
||||
@ -578,7 +578,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
|
||||
def _process_image_input(
|
||||
self, image_input: AriaImagePixelInputs
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert self.vision_tower is not None
|
||||
|
||||
pixel_values = image_input['pixel_values']
|
||||
@ -651,6 +651,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(self)
|
||||
loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0 Adapted from
|
||||
# https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
|
||||
from typing import (Iterable, Literal, Mapping, Optional, Sequence, Set, Tuple,
|
||||
TypedDict, Union, cast)
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Literal, Optional, TypedDict, Union, cast
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -315,8 +315,8 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def dtype(self):
|
||||
return next(self.parameters()).dtype
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -20,7 +20,8 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only BaiChuan model compatible with HuggingFace weights."""
|
||||
import math
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -230,7 +231,7 @@ class BaiChuanDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -320,15 +321,15 @@ class BaiChuanModel(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("gate_up_proj", "gate_proj", 0),
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -421,8 +422,8 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Inference-only Bamba model."""
|
||||
# Added by the IBM Team, 2024
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -355,8 +356,8 @@ class BambaModel(nn.Module):
|
||||
hidden_states, _ = self.final_layernorm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -367,7 +368,7 @@ class BambaModel(nn.Module):
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -495,7 +496,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
||||
return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
|
||||
|
||||
def _get_mamba_cache_shape(
|
||||
self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
|
||||
self) -> tuple[tuple[int, int], tuple[int, int]]:
|
||||
world_size = get_tensor_model_parallel_world_size()
|
||||
hidden_size = self.config.hidden_size
|
||||
|
||||
@ -535,7 +536,7 @@ class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -19,7 +19,8 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch BART model."""
|
||||
import math
|
||||
from typing import Iterable, Optional, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -859,14 +860,14 @@ class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
def _rename_stacked_param(
|
||||
self,
|
||||
name: str,
|
||||
) -> Tuple[str, Optional[str]]:
|
||||
) -> tuple[str, Optional[str]]:
|
||||
for key, mapping in self.stacked_params_mapping.items():
|
||||
if key in name:
|
||||
name = name.replace(key, mapping["param_name"])
|
||||
return name, mapping["shard_id"]
|
||||
return name, None
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
|
||||
model_params_dict = dict(self.model.named_parameters())
|
||||
top_params_dict = dict(self.named_parameters())
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -349,8 +350,8 @@ class BertModel(nn.Module, SupportsQuant):
|
||||
token_type_ids=token_type_ids)
|
||||
return self.encoder(hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "query", "q"),
|
||||
@ -359,7 +360,7 @@ class BertModel(nn.Module, SupportsQuant):
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if self.pooler is None and "pooler" in name:
|
||||
continue
|
||||
@ -424,7 +425,7 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
) -> Optional[PoolerOutput]:
|
||||
return self._pooler(hidden_states, pooling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
weights = self.hf_to_vllm_mapper.apply(weights)
|
||||
weights = ((name, data) for name, data in weights
|
||||
if not name.startswith("lm_head."))
|
||||
@ -472,7 +473,7 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
|
||||
self._pooler = CrossEncodingPooler(config, self.classifier,
|
||||
self.bert.pooler)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
|
||||
self_weights = []
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -208,7 +209,7 @@ class NomicRouter(nn.Module):
|
||||
|
||||
def forward(
|
||||
self, x: torch.Tensor
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
|
||||
weights = self.layer(x.view(-1, x.shape[-1]))[0].softmax(
|
||||
dim=-1, dtype=torch.float32)
|
||||
top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1)
|
||||
@ -428,8 +429,8 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
token_type_ids=token_type_ids)
|
||||
return self.encoder(positions, hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
weights = self.hf_to_vllm_mapper.apply(weights)
|
||||
|
||||
if self.config.hidden_act in ["silu", "geglu"]:
|
||||
@ -442,7 +443,7 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
|
||||
stacked_params_mapping = []
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "pooler" in name:
|
||||
continue
|
||||
@ -567,7 +568,7 @@ class GteNewModel(BertWithRope):
|
||||
}
|
||||
return config
|
||||
|
||||
def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def split_up_gate_proj(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
n = "mlp.up_gate_proj"
|
||||
for name, weight in weights:
|
||||
if n in name:
|
||||
@ -578,14 +579,14 @@ class GteNewModel(BertWithRope):
|
||||
yield name, weight
|
||||
|
||||
def ignore_unnecessary_layers(self,
|
||||
weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
for name, weight in weights:
|
||||
if name.startswith("classifier"):
|
||||
continue
|
||||
yield name, weight
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
weights = self.ignore_unnecessary_layers(weights)
|
||||
weights = self.split_up_gate_proj(weights)
|
||||
return super().load_weights(weights)
|
||||
@ -664,7 +665,7 @@ class JinaRobertaModel(BertWithRope):
|
||||
token_type_ids=token_type_ids)
|
||||
|
||||
@torch.inference_mode()
|
||||
def jina_merge_lora_weights(self, weights: Iterable[Tuple[str,
|
||||
def jina_merge_lora_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]):
|
||||
# use for jina-embeddings-v3
|
||||
# Merge Lora weights into a single weight tensor.
|
||||
@ -707,7 +708,7 @@ class JinaRobertaModel(BertWithRope):
|
||||
|
||||
return [(name, weight) for name, weight in weights.items()]
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
weights = self.jina_merge_lora_weights(weights)
|
||||
return super().load_weights(weights)
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Minimal implementation of BlipVisionModel intended to be only used
|
||||
within a vision language model."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -296,8 +297,8 @@ class BlipVisionModel(nn.Module, SupportsQuant):
|
||||
|
||||
return self.post_layernorm(hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -305,7 +306,7 @@ class BlipVisionModel(nn.Module, SupportsQuant):
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
layer_count = len(self.encoder.layers)
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -186,7 +186,7 @@ class Blip2QFormerAttention(nn.Module):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
||||
) -> Tuple[torch.Tensor]:
|
||||
) -> tuple[torch.Tensor]:
|
||||
self_output = self.attention(
|
||||
hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
@ -712,7 +712,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -18,7 +18,8 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only BLOOM model compatible with HuggingFace weights."""
|
||||
import math
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -322,10 +323,10 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if name == "lm_head.weight":
|
||||
continue
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
from typing import Any, Dict, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Any, Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -229,7 +229,7 @@ class ChameleonAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 4096,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -292,7 +292,7 @@ class ChameleonAttention(nn.Module):
|
||||
prefix=f"{prefix}.attn")
|
||||
|
||||
def _apply_qk_norm(self, q: torch.Tensor,
|
||||
k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# reshape for layernorm
|
||||
q = q.reshape(-1, self.num_heads, self.head_dim)
|
||||
k = k.reshape(-1, self.num_kv_heads, self.head_dim)
|
||||
@ -367,7 +367,7 @@ class ChameleonDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
||||
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -438,7 +438,7 @@ class ChameleonSwinDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
|
||||
residual = hidden_states
|
||||
hidden_states = self.self_attn(
|
||||
@ -773,7 +773,7 @@ class ChameleonVQVAE(nn.Module):
|
||||
|
||||
def encode(
|
||||
self, pixel_values: torch.Tensor
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
hidden_states = self.encoder(pixel_values)
|
||||
hidden_states = self.quant_conv(hidden_states)
|
||||
quant, emb_loss, indices = self.quantize(hidden_states)
|
||||
@ -786,7 +786,7 @@ class ChameleonImageVocabularyMapping:
|
||||
A class for mapping discrete image tokens from VQGAN to BPE tokens.
|
||||
"""
|
||||
|
||||
def __init__(self, vocab_map: Dict[str, int]):
|
||||
def __init__(self, vocab_map: dict[str, int]):
|
||||
self.vocab_map = vocab_map
|
||||
self.image_token_id = vocab_map.get("<image>")
|
||||
|
||||
@ -1052,8 +1052,8 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -1063,7 +1063,7 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
|
||||
@ -3,7 +3,8 @@
|
||||
# https://github.com/THUDM/ChatGLM2-6B
|
||||
"""Inference-only ChatGLM model compatible with THUDM weights."""
|
||||
import json
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -358,15 +359,15 @@ class ChatGLMModel(nn.Module, SupportsQuant):
|
||||
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("linear_proj.merged_proj", "linear_proj.gate_proj", 0),
|
||||
("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
@ -440,7 +441,7 @@ class ChatGLMBaseModel(nn.Module):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
|
||||
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Minimal implementation of CLIPVisionModel intended to be only used
|
||||
within a vision language model."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -368,8 +369,8 @@ class CLIPVisionModel(nn.Module, SupportsQuant):
|
||||
|
||||
# (TODO) Add prefix argument for filtering out weights to be loaded
|
||||
# ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -377,7 +378,7 @@ class CLIPVisionModel(nn.Module, SupportsQuant):
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
layer_count = len(self.vision_model.encoder.layers)
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
@ -21,7 +21,8 @@
|
||||
|
||||
# This file is based on the LLama model definition file in transformers
|
||||
"""PyTorch Cohere model."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -259,7 +260,7 @@ class CohereDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
residual = hidden_states
|
||||
hidden_states, residual = self.input_layernorm(hidden_states, residual)
|
||||
@ -404,8 +405,8 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
|
||||
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -415,7 +416,7 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
# Skip loading rotary embeddings since vLLM has its own
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Tuple
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
@ -16,7 +16,7 @@ class ConstantSizeCache(ABC):
|
||||
def __init__(self, max_batch_size: int):
|
||||
# Maps between the request id and a dict that maps between the seq_id
|
||||
# and its index inside the cache
|
||||
self.cache_indices_mapping: Dict[str, Dict[int, int]] = {}
|
||||
self.cache_indices_mapping: dict[str, dict[int, int]] = {}
|
||||
self.free_cache_indices = list(range(max_batch_size))
|
||||
|
||||
@property
|
||||
@ -30,7 +30,7 @@ class ConstantSizeCache(ABC):
|
||||
"""Copy cache data from one index to another"""
|
||||
pass
|
||||
|
||||
def current_run_tensors(self, **kwargs) -> Tuple:
|
||||
def current_run_tensors(self, **kwargs) -> tuple:
|
||||
"""
|
||||
Return the tensors for the current run's conv and ssm state.
|
||||
"""
|
||||
@ -117,8 +117,8 @@ class ConstantSizeCache(ABC):
|
||||
return self.cache_indices_mapping[cur_rid][seq_id]
|
||||
|
||||
def _prepare_current_run_cache(
|
||||
self, request_ids_to_seq_ids: Dict[str, list[int]],
|
||||
finished_requests_ids: List[str]) -> List[int]:
|
||||
self, request_ids_to_seq_ids: dict[str, list[int]],
|
||||
finished_requests_ids: list[str]) -> list[int]:
|
||||
return [
|
||||
self._assign_seq_id_to_cache_index(req_id, seq_id,
|
||||
finished_requests_ids)
|
||||
@ -127,7 +127,7 @@ class ConstantSizeCache(ABC):
|
||||
]
|
||||
|
||||
def _release_finished_requests(self,
|
||||
finished_seq_groups_req_ids: List[str]):
|
||||
finished_seq_groups_req_ids: list[str]):
|
||||
for req_id in finished_seq_groups_req_ids:
|
||||
if req_id in self.cache_indices_mapping:
|
||||
for seq_id in self.cache_indices_mapping[req_id]:
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -414,14 +415,14 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
expert_params_mapping = [(
|
||||
"w13" if weight_name in ["w1", "v1"] else "w2",
|
||||
f"mlp.{weight_name}",
|
||||
) for weight_name in ["w1", "v1", "w2"]]
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
if (self.quant_config is not None and
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Deepseek model."""
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -184,7 +185,7 @@ class DeepseekAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -385,8 +386,8 @@ class DeepseekModel(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -397,7 +398,7 @@ class DeepseekModel(nn.Module):
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -478,7 +479,7 @@ class DeepseekForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -176,8 +177,8 @@ class DeepSeekMTP(nn.Module):
|
||||
return self.model.compute_logits(hidden_states, sampling_metadata,
|
||||
spec_step_idx)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
("gate_up_proj", "gate_proj", 0),
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
@ -190,7 +191,7 @@ class DeepSeekMTP(nn.Module):
|
||||
num_experts=self.config.n_routed_experts)
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only DeepseekV2/DeepseekV3 model."""
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -200,7 +201,7 @@ class DeepseekV2Attention(nn.Module):
|
||||
q_lora_rank: int,
|
||||
kv_lora_rank: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -352,7 +353,7 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
q_lora_rank: Optional[int],
|
||||
kv_lora_rank: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -736,8 +737,8 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
|
||||
device=device),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("gate_up_proj", "gate_proj", 0),
|
||||
@ -753,7 +754,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
|
||||
num_experts=self.config.n_routed_experts)
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
"""Inference-only Deepseek-VL2 model compatible with HuggingFace weights."""
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -45,7 +45,7 @@ _IMAGE_TOKEN = "<image>"
|
||||
|
||||
class DeepseekVL2ImagePixelInputs(TypedDict):
|
||||
type: Literal["pixel_values"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Shape: `(batch_size * num_images, num_channels, height, width)`
|
||||
"""
|
||||
@ -57,7 +57,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict):
|
||||
|
||||
class DeepseekVL2VImageEmbeddingInputs(TypedDict):
|
||||
type: Literal["image_embeds"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
|
||||
|
||||
`hidden_size` must match the hidden size of language model backbone.
|
||||
@ -394,8 +394,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
return model
|
||||
|
||||
def _validate_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
h = w = self.vision_config.image_size
|
||||
expected_dims = (3, h, w)
|
||||
@ -415,8 +415,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
return data
|
||||
|
||||
def _validate_images_spatial_crop(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
expected_dims = 2
|
||||
|
||||
def _validate_shape(d: torch.Tensor):
|
||||
@ -640,8 +640,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
|
||||
loader = AutoWeightsLoader(self)
|
||||
autoloaded_weights = loader.load_weights(weights,
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Iterable, Optional, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -183,7 +184,7 @@ class EAGLE(nn.Module):
|
||||
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
# This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
|
||||
# due to missing lm_head weights and its config being that of a
|
||||
# Llama model. Here's a compatible version with the same weights:
|
||||
|
||||
@ -24,7 +24,8 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only Exaone model compatible with HuggingFace weights."""
|
||||
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -102,7 +103,7 @@ class ExaoneAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -196,7 +197,7 @@ class ExaoneBlockAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -282,7 +283,7 @@ class ExaoneDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -384,8 +385,8 @@ class ExaoneModel(nn.Module):
|
||||
hidden_states, _ = self.ln_f(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -395,7 +396,7 @@ class ExaoneModel(nn.Module):
|
||||
(".gate_up_proj", ".c_fc_1", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -535,8 +536,8 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
# With tie_word_embeddings, we can skip lm_head.weight
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
# limitations under the License.
|
||||
"""Llama model for fairseq2 weights."""
|
||||
|
||||
from typing import Iterable, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
|
||||
import torch
|
||||
from torch.nn import Parameter
|
||||
@ -44,8 +44,8 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
|
||||
f"model.{self.tp_rank}.pt",
|
||||
]
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
# fairseq2's serialization adds a wrapper to usual .pt state_dict's:
|
||||
# { "model_key": my_model_name, "my_model_name": state_dict }
|
||||
# which we first need to unpack
|
||||
@ -102,7 +102,7 @@ class Fairseq2LlamaForCausalLM(LlamaForCausalLM):
|
||||
name: str,
|
||||
loaded_weight: torch.Tensor,
|
||||
params: dict[str, Parameter],
|
||||
) -> Tuple[str, torch.Tensor]:
|
||||
) -> tuple[str, torch.Tensor]:
|
||||
"""Reshape fairseq2's weights."""
|
||||
|
||||
def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor:
|
||||
|
||||
@ -20,7 +20,8 @@
|
||||
"""PyTorch Falcon model."""
|
||||
|
||||
import math
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -394,8 +395,8 @@ class FalconModel(nn.Module):
|
||||
hidden_states = self.ln_f(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
total_num_heads = self.config.num_attention_heads
|
||||
if self.config.new_decoder_architecture:
|
||||
total_num_kv_heads = self.config.num_kv_heads
|
||||
@ -405,7 +406,7 @@ class FalconModel(nn.Module):
|
||||
total_num_kv_heads = total_num_heads
|
||||
num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
# Skip loading extra bias for GPTQ models.
|
||||
if name.endswith(".bias") and name not in params_dict:
|
||||
@ -498,8 +499,8 @@ class FalconForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import math
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -713,8 +713,8 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -723,7 +723,7 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only):
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
@ -922,8 +922,8 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
'Florence2 only supports COSINE as temporal embedding.')
|
||||
|
||||
def _validate_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
size = self.processor_config["size"]
|
||||
h, w = size["height"], size["width"]
|
||||
@ -944,12 +944,12 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return data
|
||||
|
||||
def _parse_and_validate_image_input(self, **kwargs: object):
|
||||
pixel_values: Optional[Union[List[List[torch.Tensor]],
|
||||
List[torch.Tensor],
|
||||
pixel_values: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"pixel_values", None)
|
||||
image_embeds: Optional[Union[List[List[torch.Tensor]],
|
||||
List[torch.Tensor],
|
||||
image_embeds: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"image_embeds", None)
|
||||
|
||||
@ -1096,7 +1096,7 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
""" PyTorch Fuyu model."""
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Literal, Optional, Set, Tuple, TypedDict
|
||||
from typing import Literal, Optional, TypedDict
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -382,7 +382,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self.language_model.lm_head, hidden_states, sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -15,8 +15,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Gemma model compatible with HuggingFace weights."""
|
||||
from collections.abc import Iterable
|
||||
from functools import cache
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -231,7 +232,7 @@ class GemmaDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -318,8 +319,8 @@ class GemmaModel(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -329,7 +330,7 @@ class GemmaModel(nn.Module):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, shard_name, shard_id) in stacked_params_mapping:
|
||||
if shard_name not in name:
|
||||
@ -413,8 +414,8 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -15,7 +15,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -218,7 +219,7 @@ class Gemma2DecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
@ -305,8 +306,8 @@ class Gemma2Model(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -316,7 +317,7 @@ class Gemma2Model(nn.Module):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if (self.quant_config is not None and
|
||||
(scale_name := self.quant_config.get_cache_scale(name))):
|
||||
@ -413,8 +414,8 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -14,7 +14,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@ -320,7 +321,7 @@ class Gemma3DecoderLayer(nn.Module):
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
@ -412,8 +413,8 @@ class Gemma3Model(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -423,7 +424,7 @@ class Gemma3Model(nn.Module):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if (self.quant_config is not None and
|
||||
(scale_name := self.quant_config.get_cache_scale(name))):
|
||||
@ -521,8 +522,8 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Any, Literal, Optional, Set, Tuple, TypedDict
|
||||
from typing import Any, Literal, Optional, TypedDict
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -701,8 +701,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -21,7 +21,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only GLM-4-0414 model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -60,7 +61,7 @@ class Glm4Attention(nn.Module):
|
||||
rope_theta: float = 10000,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
rope_scaling: Optional[Tuple] = None,
|
||||
rope_scaling: Optional[tuple] = None,
|
||||
prefix: str = "",
|
||||
attn_type: str = AttentionType.DECODER) -> None:
|
||||
super().__init__()
|
||||
@ -183,7 +184,7 @@ class Glm4DecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -293,8 +294,8 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -18,7 +18,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only GPT-2 model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -280,10 +281,10 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if ".attn.bias" in name or ".attn.masked_bias" in name:
|
||||
# Skip attention mask.
|
||||
|
||||
@ -19,7 +19,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only GPTBigCode model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -243,10 +244,10 @@ class GPTBigCodeModel(nn.Module):
|
||||
hidden_states = self.ln_f(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if ".attn.bias" in name:
|
||||
# Skip attention mask.
|
||||
@ -327,8 +328,8 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]),
|
||||
|
||||
@ -17,7 +17,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only GPT-J model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -228,8 +229,8 @@ class GPTJModel(nn.Module):
|
||||
hidden_states = self.ln_f(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -239,7 +240,7 @@ class GPTJModel(nn.Module):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "attn.bias" in name or "attn.masked_bias" in name:
|
||||
continue
|
||||
@ -331,7 +332,7 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata, self.lm_head.bias)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
@ -17,7 +17,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only GPT-NeoX model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -240,10 +241,10 @@ class GPTNeoXModel(nn.Module):
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if ("attention.bias" in name or "attention.masked_bias" in name
|
||||
or "rotary_emb.inv_freq" in name):
|
||||
@ -324,7 +325,7 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only IBM Granite model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -97,7 +98,7 @@ class GraniteAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -230,7 +231,7 @@ class GraniteDecoderLayer(nn.Module):
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
@ -321,8 +322,8 @@ class GraniteModel(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -332,7 +333,7 @@ class GraniteModel(nn.Module):
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if (self.quant_config is not None and
|
||||
(scale_name := self.quant_config.get_cache_scale(name))):
|
||||
@ -475,8 +476,8 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
device=device),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
skip_prefixes = [
|
||||
"rotary_emb.inv_freq",
|
||||
# Models trained using ColossalAI may include these tensors in
|
||||
|
||||
@ -23,7 +23,8 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only IBM Granite speech model."""
|
||||
import math
|
||||
from typing import Iterable, Mapping, Optional, Set, Tuple, TypedDict, Union
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@ -763,8 +764,8 @@ class GraniteSpeechForConditionalGeneration(
|
||||
|
||||
def load_weights(
|
||||
self,
|
||||
weights: Iterable[Tuple[str, torch.Tensor]],
|
||||
) -> Set[str]:
|
||||
weights: Iterable[tuple[str, torch.Tensor]],
|
||||
) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only GraniteMoe model."""
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -305,8 +306,8 @@ class GraniteMoeModel(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
new_weights = {}
|
||||
for n, p in weights:
|
||||
if n.endswith('.block_sparse_moe.input_linear.weight'):
|
||||
@ -425,8 +426,8 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
device=device),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Inference-only GraniteMoeHybrid model."""
|
||||
# Added by the IBM Team, 2025
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -381,10 +382,10 @@ class GraniteMoeHybridModel(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
def _load(n, p):
|
||||
param = params_dict[n]
|
||||
@ -538,7 +539,7 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
|
||||
return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
|
||||
|
||||
def _get_mamba_cache_shape(
|
||||
self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
|
||||
self) -> tuple[tuple[int, int], tuple[int, int]]:
|
||||
world_size = get_tensor_model_parallel_world_size()
|
||||
hidden_size = self.config.hidden_size
|
||||
|
||||
@ -578,7 +579,7 @@ class GraniteMoeHybridForCausalLM(nn.Module, HasInnerState, SupportsLoRA,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -4,7 +4,8 @@
|
||||
The architecture is the same as granitemoe but with the addition of shared
|
||||
experts.
|
||||
"""
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -208,8 +209,8 @@ class GraniteMoeSharedModel(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
new_weights = {}
|
||||
for n, p in weights:
|
||||
if n.endswith('.block_sparse_moe.input_linear.weight'):
|
||||
@ -329,8 +330,8 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
device=device),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -21,7 +21,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Grok1 model."""
|
||||
from typing import Iterable, List, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@ -263,7 +264,7 @@ class Grok1DecoderLayer(nn.Module):
|
||||
kv_cache: torch.Tensor,
|
||||
attn_metadata: AttentionMetadata,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -340,7 +341,7 @@ class Grok1Model(nn.Module):
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
kv_caches: list[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
intermediate_tensors: Optional[IntermediateTensors],
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
@ -371,8 +372,8 @@ class Grok1Model(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -390,7 +391,7 @@ class Grok1Model(nn.Module):
|
||||
num_experts=num_experts)
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
if (self.quant_config is not None and
|
||||
@ -528,7 +529,7 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: List[torch.Tensor],
|
||||
kv_caches: list[torch.Tensor],
|
||||
attn_metadata: AttentionMetadata,
|
||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
@ -547,8 +548,8 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
skip_prefixes = ["rotary_emb.inv_freq"]
|
||||
# Skip lm_head when tie_word_embeddings is True
|
||||
if self.config.tie_word_embeddings:
|
||||
|
||||
@ -17,7 +17,8 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch Idefics2 model."""
|
||||
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -342,8 +343,8 @@ class Idefics2VisionTransformer(nn.Module):
|
||||
last_hidden_state = self.post_layernorm(encoder_outputs)
|
||||
return last_hidden_state
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -351,7 +352,7 @@ class Idefics2VisionTransformer(nn.Module):
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
layer_count = len(self.encoder.layers)
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Dict, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -85,7 +85,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
size: Optional[Dict[str, int]] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
) -> Idefics3Processor:
|
||||
if size is not None:
|
||||
@ -752,8 +752,8 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
|
||||
Protocol, Type, Union, overload, runtime_checkable)
|
||||
from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
|
||||
Union, overload, runtime_checkable)
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
@ -102,7 +102,7 @@ class _SupportsMultiModalType(Protocol):
|
||||
|
||||
@overload
|
||||
def supports_multimodal(
|
||||
model: Type[object]) -> TypeIs[Type[SupportsMultiModal]]:
|
||||
model: type[object]) -> TypeIs[type[SupportsMultiModal]]:
|
||||
...
|
||||
|
||||
|
||||
@ -112,8 +112,8 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]:
|
||||
|
||||
|
||||
def supports_multimodal(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _SupportsMultiModalType)
|
||||
|
||||
@ -134,9 +134,9 @@ class SupportsLoRA(Protocol):
|
||||
"""
|
||||
# The `embedding_modules` and `embedding_padding_modules`
|
||||
# are empty by default.
|
||||
embedding_modules: ClassVar[Dict[str, str]] = {}
|
||||
embedding_padding_modules: ClassVar[List[str]] = []
|
||||
packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {}
|
||||
embedding_modules: ClassVar[dict[str, str]] = {}
|
||||
embedding_padding_modules: ClassVar[list[str]] = []
|
||||
packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}
|
||||
|
||||
|
||||
# We can't use runtime_checkable with ClassVar for issubclass checks
|
||||
@ -145,13 +145,13 @@ class SupportsLoRA(Protocol):
|
||||
class _SupportsLoRAType(Protocol):
|
||||
supports_lora: Literal[True]
|
||||
|
||||
packed_modules_mapping: Dict[str, List[str]]
|
||||
embedding_modules: Dict[str, str]
|
||||
embedding_padding_modules: List[str]
|
||||
packed_modules_mapping: dict[str, list[str]]
|
||||
embedding_modules: dict[str, str]
|
||||
embedding_padding_modules: list[str]
|
||||
|
||||
|
||||
@overload
|
||||
def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]:
|
||||
def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]:
|
||||
...
|
||||
|
||||
|
||||
@ -161,8 +161,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]:
|
||||
|
||||
|
||||
def supports_lora(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]:
|
||||
result = _supports_lora(model)
|
||||
|
||||
if not result:
|
||||
@ -191,7 +191,7 @@ def supports_lora(
|
||||
return result
|
||||
|
||||
|
||||
def _supports_lora(model: Union[Type[object], object]) -> bool:
|
||||
def _supports_lora(model: Union[type[object], object]) -> bool:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _SupportsLoRAType)
|
||||
|
||||
@ -256,7 +256,7 @@ class _SupportsPPType(Protocol):
|
||||
|
||||
|
||||
@overload
|
||||
def supports_pp(model: Type[object]) -> TypeIs[Type[SupportsPP]]:
|
||||
def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]:
|
||||
...
|
||||
|
||||
|
||||
@ -266,8 +266,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]:
|
||||
|
||||
|
||||
def supports_pp(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]:
|
||||
supports_attributes = _supports_pp_attributes(model)
|
||||
supports_inspect = _supports_pp_inspect(model)
|
||||
|
||||
@ -298,14 +298,14 @@ def supports_pp(
|
||||
return supports_attributes and supports_inspect
|
||||
|
||||
|
||||
def _supports_pp_attributes(model: Union[Type[object], object]) -> bool:
|
||||
def _supports_pp_attributes(model: Union[type[object], object]) -> bool:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _SupportsPPType)
|
||||
|
||||
return isinstance(model, SupportsPP)
|
||||
|
||||
|
||||
def _supports_pp_inspect(model: Union[Type[object], object]) -> bool:
|
||||
def _supports_pp_inspect(model: Union[type[object], object]) -> bool:
|
||||
model_forward = getattr(model, "forward", None)
|
||||
if not callable(model_forward):
|
||||
return False
|
||||
@ -336,13 +336,13 @@ def has_inner_state(model: object) -> TypeIs[HasInnerState]:
|
||||
|
||||
|
||||
@overload
|
||||
def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]:
|
||||
def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]:
|
||||
...
|
||||
|
||||
|
||||
def has_inner_state(
|
||||
model: Union[Type[object], object]
|
||||
) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]:
|
||||
model: Union[type[object], object]
|
||||
) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _HasInnerStateType)
|
||||
|
||||
@ -373,13 +373,13 @@ def is_attention_free(model: object) -> TypeIs[IsAttentionFree]:
|
||||
|
||||
|
||||
@overload
|
||||
def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]:
|
||||
def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]:
|
||||
...
|
||||
|
||||
|
||||
def is_attention_free(
|
||||
model: Union[Type[object], object]
|
||||
) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]:
|
||||
model: Union[type[object], object]
|
||||
) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _IsAttentionFreeType)
|
||||
|
||||
@ -410,13 +410,13 @@ def is_hybrid(model: object) -> TypeIs[IsHybrid]:
|
||||
|
||||
|
||||
@overload
|
||||
def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]:
|
||||
def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]:
|
||||
...
|
||||
|
||||
|
||||
def is_hybrid(
|
||||
model: Union[Type[object], object]
|
||||
) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]:
|
||||
model: Union[type[object], object]
|
||||
) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _IsHybridType)
|
||||
|
||||
@ -439,13 +439,13 @@ def has_noops(model: object) -> TypeIs[HasNoOps]:
|
||||
|
||||
|
||||
@overload
|
||||
def has_noops(model: Type[object]) -> TypeIs[Type[HasNoOps]]:
|
||||
def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]:
|
||||
...
|
||||
|
||||
|
||||
def has_noops(
|
||||
model: Union[Type[object], object]
|
||||
) -> Union[TypeIs[Type[HasNoOps]], TypeIs[HasNoOps]]:
|
||||
model: Union[type[object], object]
|
||||
) -> Union[TypeIs[type[HasNoOps]], TypeIs[HasNoOps]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, _HasNoOpsType)
|
||||
|
||||
@ -461,7 +461,7 @@ class SupportsCrossEncoding(Protocol):
|
||||
|
||||
@overload
|
||||
def supports_cross_encoding(
|
||||
model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]:
|
||||
model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]:
|
||||
...
|
||||
|
||||
|
||||
@ -471,8 +471,8 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]:
|
||||
|
||||
|
||||
def _supports_cross_encoding(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
|
||||
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, SupportsCrossEncoding)
|
||||
@ -481,15 +481,15 @@ def _supports_cross_encoding(
|
||||
|
||||
|
||||
def supports_cross_encoding(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
|
||||
return is_pooling_model(model) and _supports_cross_encoding(model)
|
||||
|
||||
|
||||
class SupportsQuant:
|
||||
"""The interface required for all models that support quantization."""
|
||||
|
||||
packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {}
|
||||
packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}
|
||||
quant_config: Optional[QuantizationConfig] = None
|
||||
|
||||
def __new__(cls, *args, **kwargs) -> Self:
|
||||
@ -525,7 +525,7 @@ class SupportsTranscription(Protocol):
|
||||
|
||||
@overload
|
||||
def supports_transcription(
|
||||
model: Type[object]) -> TypeIs[Type[SupportsTranscription]]:
|
||||
model: type[object]) -> TypeIs[type[SupportsTranscription]]:
|
||||
...
|
||||
|
||||
|
||||
@ -535,8 +535,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]:
|
||||
|
||||
|
||||
def supports_transcription(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, SupportsTranscription)
|
||||
|
||||
@ -551,7 +551,7 @@ class SupportsV0Only(Protocol):
|
||||
|
||||
|
||||
@overload
|
||||
def supports_v0_only(model: Type[object]) -> TypeIs[Type[SupportsV0Only]]:
|
||||
def supports_v0_only(model: type[object]) -> TypeIs[type[SupportsV0Only]]:
|
||||
...
|
||||
|
||||
|
||||
@ -561,8 +561,8 @@ def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]:
|
||||
|
||||
|
||||
def supports_v0_only(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[SupportsV0Only]], TypeIs[SupportsV0Only]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]:
|
||||
if isinstance(model, type):
|
||||
return isinstance(model, SupportsV0Only)
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import (TYPE_CHECKING, Optional, Protocol, Type, Union, overload,
|
||||
from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload,
|
||||
runtime_checkable)
|
||||
|
||||
import torch
|
||||
@ -20,7 +20,7 @@ logger = init_logger(__name__)
|
||||
|
||||
# The type of hidden states
|
||||
# Currently, T = torch.Tensor for all models except for Medusa
|
||||
# which has T = List[torch.Tensor]
|
||||
# which has T = list[torch.Tensor]
|
||||
T = TypeVar("T", default=torch.Tensor)
|
||||
T_co = TypeVar("T_co", default=torch.Tensor, covariant=True)
|
||||
|
||||
@ -48,12 +48,12 @@ class VllmModel(Protocol[T_co]):
|
||||
...
|
||||
|
||||
|
||||
def _check_vllm_model_init(model: Union[Type[object], object]) -> bool:
|
||||
def _check_vllm_model_init(model: Union[type[object], object]) -> bool:
|
||||
model_init = model.__init__
|
||||
return supports_kw(model_init, "vllm_config")
|
||||
|
||||
|
||||
def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool:
|
||||
def _check_vllm_model_forward(model: Union[type[object], object]) -> bool:
|
||||
model_forward = getattr(model, "forward", None)
|
||||
if not callable(model_forward):
|
||||
return False
|
||||
@ -75,7 +75,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool:
|
||||
|
||||
|
||||
@overload
|
||||
def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]:
|
||||
def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]:
|
||||
...
|
||||
|
||||
|
||||
@ -85,8 +85,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]:
|
||||
|
||||
|
||||
def is_vllm_model(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]:
|
||||
return _check_vllm_model_init(model) and _check_vllm_model_forward(model)
|
||||
|
||||
|
||||
@ -105,7 +105,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
|
||||
|
||||
@overload
|
||||
def is_text_generation_model(
|
||||
model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]:
|
||||
model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]:
|
||||
...
|
||||
|
||||
|
||||
@ -116,8 +116,8 @@ def is_text_generation_model(
|
||||
|
||||
|
||||
def is_text_generation_model(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[VllmModelForTextGeneration]],
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[VllmModelForTextGeneration]],
|
||||
TypeIs[VllmModelForTextGeneration]]:
|
||||
if not is_vllm_model(model):
|
||||
return False
|
||||
@ -142,7 +142,7 @@ class VllmModelForPooling(VllmModel[T], Protocol[T]):
|
||||
|
||||
|
||||
@overload
|
||||
def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]:
|
||||
def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]:
|
||||
...
|
||||
|
||||
|
||||
@ -152,8 +152,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]:
|
||||
|
||||
|
||||
def is_pooling_model(
|
||||
model: Union[Type[object], object],
|
||||
) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]:
|
||||
model: Union[type[object], object],
|
||||
) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]:
|
||||
if not is_vllm_model(model):
|
||||
return False
|
||||
|
||||
|
||||
@ -6,8 +6,9 @@
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from collections.abc import Iterable
|
||||
from functools import partial
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -461,10 +462,10 @@ class InternVisionModel(nn.Module):
|
||||
|
||||
return encoder_outputs
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
param = params_dict[name]
|
||||
weight_loader = getattr(param, "weight_loader",
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections.abc import Iterable
|
||||
from functools import partial
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -81,7 +82,7 @@ class InternLM2Attention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -225,7 +226,7 @@ class InternLMDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -252,7 +253,7 @@ class InternLM2Model(nn.Module):
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer):
|
||||
layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer):
|
||||
super().__init__()
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
@ -316,7 +317,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
model_type: Type[InternLM2Model] = InternLM2Model):
|
||||
model_type: type[InternLM2Model] = InternLM2Model):
|
||||
super().__init__()
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
@ -361,15 +362,15 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("gate_up_proj", "w1", 0),
|
||||
("gate_up_proj", "w3", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -407,7 +408,7 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
model_type: Type[InternLM2Model] = InternLM2Model,
|
||||
model_type: type[InternLM2Model] = InternLM2Model,
|
||||
):
|
||||
super().__init__(vllm_config=vllm_config,
|
||||
prefix=prefix,
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -66,7 +66,7 @@ class InternLM2VEDecoderLayer(nn.Module):
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
visual_token_mask: Optional[torch.Tensor] = None,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
# --------------------------------------------------------
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union
|
||||
from typing import Literal, Optional, TypedDict, TypeVar, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -932,8 +932,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
# unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B
|
||||
skip_prefixes = [
|
||||
"action_embed", "temporal_embed", "track_embed",
|
||||
|
||||
@ -21,7 +21,8 @@
|
||||
"""Inference-only Jais model compatible with HuggingFace weights."""
|
||||
|
||||
import math
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -333,10 +334,10 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "lm_head.weight" in name:
|
||||
# GPT-2 ties the weights of the embedding layer and the final
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Inference-only Jamba model."""
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -442,7 +443,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
||||
return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
|
||||
|
||||
def _get_mamba_cache_shape(
|
||||
self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
|
||||
self) -> tuple[tuple[int, int], tuple[int, int]]:
|
||||
world_size = get_tensor_model_parallel_world_size()
|
||||
hidden_size = self.config.hidden_size
|
||||
conv_state_shape = (
|
||||
@ -464,8 +465,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -482,7 +483,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
|
||||
num_experts=self.config.num_experts)
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -583,7 +584,7 @@ class JambaForSequenceClassification(JambaForCausalLM):
|
||||
logits = self.score(hidden_states)
|
||||
return self._pooler(logits, pooling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
# TODO: The reward weights themselves have float32 accuracy data, we
|
||||
# would like to load them in fp32 to get that extra precision.
|
||||
super().load_weights(weights)
|
||||
|
||||
@ -43,10 +43,9 @@
|
||||
|
||||
import copy
|
||||
import math
|
||||
from collections.abc import Mapping
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from dataclasses import dataclass
|
||||
from typing import (Any, Iterable, List, Literal, Optional, Sequence, Tuple,
|
||||
TypedDict, Union)
|
||||
from typing import Any, Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -120,7 +119,7 @@ class KimiVLMultiModalProjector(nn.Module):
|
||||
|
||||
class KimiVLImagePixelInputs(TypedDict):
|
||||
type: Literal["pixel_values"]
|
||||
pixel_values: Union[torch.Tensor, List[torch.Tensor]]
|
||||
pixel_values: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Shape:`(num_patches, num_channels, patch_size, patch_size)`
|
||||
"""
|
||||
@ -447,7 +446,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
sampling_metadata, **kwargs)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
config = self.config.text_config
|
||||
_KEYS_TO_MODIFY_MAPPING = {
|
||||
"language_model.lm_head": "lm_head",
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only LLaMA model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -103,7 +104,7 @@ class LlamaAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -285,7 +286,7 @@ class LlamaDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -394,8 +395,8 @@ class LlamaModel(nn.Module):
|
||||
return hidden_states, aux_hidden_states
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -405,7 +406,7 @@ class LlamaModel(nn.Module):
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -582,8 +583,8 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
@ -599,7 +600,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
self,
|
||||
name: str,
|
||||
loaded_weight: torch.Tensor,
|
||||
) -> Tuple[str, torch.Tensor]:
|
||||
) -> tuple[str, torch.Tensor]:
|
||||
|
||||
def permute(w: torch.Tensor, n_heads: int):
|
||||
attn_in = self.config.head_dim * n_heads
|
||||
|
||||
@ -16,7 +16,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only LLaMA model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -48,7 +49,7 @@ class Llama4MoE(nn.Module):
|
||||
gating_output: torch.Tensor,
|
||||
topk: int,
|
||||
renormalize: bool,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
router_scores, router_indices = fast_topk(gating_output, topk, dim=-1)
|
||||
# pseudo-standard is that the router scores are floats
|
||||
router_scores = torch.sigmoid(router_scores.float())
|
||||
@ -115,7 +116,7 @@ class Llama4Attention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -300,7 +301,7 @@ class Llama4DecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -335,9 +336,9 @@ class Llama4Model(LlamaModel):
|
||||
self,
|
||||
name: str,
|
||||
loaded_weight: torch.Tensor,
|
||||
params_dict: Dict[str, nn.Parameter],
|
||||
loaded_params: Set[str],
|
||||
expert_params_mapping: List[Tuple[str, str, int, str]],
|
||||
params_dict: dict[str, nn.Parameter],
|
||||
loaded_params: set[str],
|
||||
expert_params_mapping: list[tuple[str, str, int, str]],
|
||||
fused: bool = True,
|
||||
) -> bool:
|
||||
expert_param_loaded = False
|
||||
@ -390,8 +391,8 @@ class Llama4Model(LlamaModel):
|
||||
expert_param_loaded = True
|
||||
return expert_param_loaded
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -412,7 +413,7 @@ class Llama4Model(LlamaModel):
|
||||
ckpt_up_proj_name="gate_up_proj",
|
||||
num_experts=1)
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "experts.gate_up_proj" in name or "experts.down_proj" in name:
|
||||
fused_experts_params = True
|
||||
@ -489,8 +490,8 @@ class Llama4ForCausalLM(LlamaForCausalLM):
|
||||
prefix=prefix,
|
||||
layer_type=layer_type)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
@ -506,7 +507,7 @@ class Llama4ForCausalLM(LlamaForCausalLM):
|
||||
self,
|
||||
name: str,
|
||||
loaded_weight: torch.Tensor,
|
||||
) -> Tuple[str, torch.Tensor]:
|
||||
) -> tuple[str, torch.Tensor]:
|
||||
|
||||
def permute(w: torch.Tensor, n_heads: int):
|
||||
attn_in = self.config.head_dim * n_heads
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Iterable, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -92,8 +92,8 @@ class LlamaModel(nn.Module):
|
||||
hidden_states = hidden_states + residual
|
||||
return hidden_states, hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -103,7 +103,7 @@ class LlamaModel(nn.Module):
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for param_name, weight_name, shard_id in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
@ -150,7 +150,7 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
return self.model(input_ids, positions, hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=None,
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -56,7 +57,7 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
|
||||
embeds: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
|
||||
residual = hidden_states
|
||||
embeds = self.input_layernorm(embeds)
|
||||
@ -140,8 +141,8 @@ class LlamaModel(nn.Module):
|
||||
hidden_states, hidden_prenorm = self.norm(hidden_states, residual)
|
||||
return hidden_states, hidden_prenorm
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -151,7 +152,7 @@ class LlamaModel(nn.Module):
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if 'midlayer.' in name:
|
||||
name = name.replace('midlayer.', 'layers.0.')
|
||||
@ -228,7 +229,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
|
||||
# combine multiple auxiliary hidden states returned by eagle3
|
||||
return self.model.fc(hidden_states)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=None,
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict,
|
||||
TypeVar, Union, cast)
|
||||
from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
|
||||
Union, cast)
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -751,8 +751,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from abc import abstractmethod
|
||||
from typing import (Final, Iterable, List, Literal, Mapping, Optional,
|
||||
Protocol, Set, Tuple, TypedDict, TypeVar, Union)
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
|
||||
Union)
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -266,8 +267,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return data
|
||||
|
||||
def _validate_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
h = w = self.config.vision_config.image_size
|
||||
expected_dims = (3, h, w)
|
||||
@ -450,7 +451,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def _process_image_input(
|
||||
self,
|
||||
image_input: LlavaNextImageInputs,
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
if image_input["type"] == "image_embeds":
|
||||
return [image_input["data"]]
|
||||
|
||||
@ -577,7 +578,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -35,7 +35,7 @@ from .vision import get_vision_encoder_info
|
||||
|
||||
class LlavaNextVideoPixelInputs(TypedDict):
|
||||
type: Literal["pixel_values_videos"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Shape: `(batch_size, num_frames, num_channels, height, width)`
|
||||
|
||||
@ -300,8 +300,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
self.language_model.model.make_empty_intermediate_tensors)
|
||||
|
||||
def _validate_video_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
h = w = self.config.vision_config.image_size
|
||||
expected_dims = (3, h, w)
|
||||
@ -326,7 +326,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
A legal video input should have the following dimensions:
|
||||
{
|
||||
"pixel_values_videos" :
|
||||
List[b, Tensor(nb_frames, nb_channels, height, width)]
|
||||
list[b, Tensor(nb_frames, nb_channels, height, width)]
|
||||
}
|
||||
"""
|
||||
pixel_values_videos = kwargs.pop("pixel_values_videos", None)
|
||||
@ -460,8 +460,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
# This model doesn't support images for now
|
||||
|
||||
@ -2,8 +2,7 @@
|
||||
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple,
|
||||
TypedDict, Union)
|
||||
from typing import Final, Literal, Optional, Protocol, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -471,8 +470,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return data
|
||||
|
||||
def _validate_image_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
h = w = self.config.vision_config.image_size
|
||||
expected_dims = (3, h, w)
|
||||
@ -530,8 +529,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
raise AssertionError("This line should be unreachable.")
|
||||
|
||||
def _validate_video_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
h = w = self.config.vision_config.image_size
|
||||
expected_dims = (3, h, w)
|
||||
@ -557,7 +556,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
A legal video input should have the following dimensions:
|
||||
{
|
||||
"pixel_values_videos" :
|
||||
List[b, Tensor(nb_frames, nb_channels, height, width)]
|
||||
list[b, Tensor(nb_frames, nb_channels, height, width)]
|
||||
}
|
||||
"""
|
||||
pixel_values_videos = kwargs.pop("pixel_values_videos", None)
|
||||
@ -706,7 +705,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def _process_image_pixels(
|
||||
self,
|
||||
inputs: LlavaOnevisionImagePixelInputs,
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
assert self.vision_tower is not None
|
||||
|
||||
pixel_values = inputs["pixel_values"]
|
||||
@ -735,7 +734,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def _process_image_input(
|
||||
self,
|
||||
image_input: LlavaOnevisionImageInputs,
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
if image_input["type"] == "image_embeds":
|
||||
return [image_input["data"]]
|
||||
|
||||
@ -948,7 +947,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""PyTorch MAMBA model."""
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -30,7 +31,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
maybe_prefix)
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
KVCache = tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
|
||||
class MambaDecoderLayer(nn.Module):
|
||||
@ -153,10 +154,10 @@ class MambaModel(nn.Module):
|
||||
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "A_log" in name:
|
||||
name = name.replace("A_log", "A")
|
||||
@ -247,7 +248,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP,
|
||||
return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
|
||||
|
||||
def _get_mamba_cache_shape(
|
||||
self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
|
||||
self) -> tuple[tuple[int, int], tuple[int, int]]:
|
||||
world_size = get_tensor_model_parallel_world_size()
|
||||
conv_state_shape = (
|
||||
self.config.intermediate_size // world_size,
|
||||
@ -265,7 +266,7 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""PyTorch MAMBA2 model."""
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -35,7 +36,7 @@ from .utils import (is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
maybe_prefix)
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
KVCache = tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
|
||||
class Mamba2DecoderLayer(nn.Module):
|
||||
@ -241,7 +242,7 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree,
|
||||
return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
|
||||
|
||||
def _get_mamba_cache_shape(
|
||||
self) -> Tuple[Tuple[int, int], Tuple[int, int]]:
|
||||
self) -> tuple[tuple[int, int], tuple[int, int]]:
|
||||
world_size = get_tensor_model_parallel_world_size()
|
||||
|
||||
conv_state_shape, temporal_state_shape = None, None
|
||||
@ -279,10 +280,10 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "A_log" in name:
|
||||
name = name.replace("A_log", "A")
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Tuple
|
||||
|
||||
import torch
|
||||
|
||||
@ -25,8 +24,8 @@ class MambaCacheParams:
|
||||
class MambaCacheManager(ConstantSizeCache):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype,
|
||||
num_mamba_layers: int, conv_state_shape: Tuple[int, int],
|
||||
temporal_state_shape: Tuple[int, int]):
|
||||
num_mamba_layers: int, conv_state_shape: tuple[int, int],
|
||||
temporal_state_shape: tuple[int, int]):
|
||||
|
||||
# Determine max batch size to set size of MambaCache
|
||||
max_batch_size = vllm_config.scheduler_config.max_num_seqs
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import Iterable, List, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -96,13 +97,13 @@ class Medusa(nn.Module):
|
||||
# checkpoint file has token_map tensor.
|
||||
self.token_map = None
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]:
|
||||
def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]:
|
||||
return [block(hidden_states) for block in self.blocks]
|
||||
|
||||
def compute_logits(
|
||||
self, hidden_states: List[torch.Tensor],
|
||||
sampling_metadata: SamplingMetadata) -> List[torch.Tensor]:
|
||||
logits_lst: List[torch.Tensor] = []
|
||||
self, hidden_states: list[torch.Tensor],
|
||||
sampling_metadata: SamplingMetadata) -> list[torch.Tensor]:
|
||||
logits_lst: list[torch.Tensor] = []
|
||||
|
||||
for hs, lm_head in zip(hidden_states, self.lm_heads):
|
||||
_logits = self.logits_processor(lm_head, hs, sampling_metadata)
|
||||
@ -127,9 +128,9 @@ class Medusa(nn.Module):
|
||||
|
||||
def sample(
|
||||
self,
|
||||
logits: List[torch.Tensor],
|
||||
logits: list[torch.Tensor],
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> List[SamplerOutput]:
|
||||
) -> list[SamplerOutput]:
|
||||
logits = torch.stack(logits, dim=0).float()
|
||||
logprobs = torch.log_softmax(logits, dim=-1)
|
||||
token_ids = logits.argmax(-1) # support only top-1 for now
|
||||
@ -144,7 +145,7 @@ class Medusa(nn.Module):
|
||||
token_prob_list.append(probs[:, seq_group.sample_indices])
|
||||
token_logprob_list.append(logprobs[:, seq_group.sample_indices])
|
||||
|
||||
outputs: List[Optional[SamplerOutput]] = []
|
||||
outputs: list[Optional[SamplerOutput]] = []
|
||||
for idx in range(len(sampling_metadata.seq_groups)):
|
||||
outputs.append(
|
||||
SamplerOutput(
|
||||
@ -160,7 +161,7 @@ class Medusa(nn.Module):
|
||||
self,
|
||||
previous_hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> List[SamplerOutput]:
|
||||
) -> list[SamplerOutput]:
|
||||
return self.sample(
|
||||
logits=self.compute_logits(
|
||||
hidden_states=self.forward(previous_hidden_states),
|
||||
@ -169,10 +170,10 @@ class Medusa(nn.Module):
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
weights_map = {}
|
||||
|
||||
|
||||
@ -24,7 +24,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only MiMo model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -87,8 +88,8 @@ class MiMoModel(Qwen2Model):
|
||||
hidden_states = hidden_states + residual
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
("qkv_proj", "k_proj", "k"),
|
||||
@ -97,7 +98,7 @@ class MiMoModel(Qwen2Model):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "mtp_layers" in name:
|
||||
continue
|
||||
|
||||
@ -18,7 +18,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only MiMo-MTP model."""
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -193,8 +194,8 @@ class MiMoMTP(nn.Module):
|
||||
next_tokens = self.sampler(logits, sampling_metadata)
|
||||
return next_tokens
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
("qkv_proj", "k_proj", "k"),
|
||||
@ -204,7 +205,7 @@ class MiMoMTP(nn.Module):
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
|
||||
@ -23,7 +23,8 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only MiniCPM model compatible with HuggingFace weights."""
|
||||
import math
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -190,7 +191,7 @@ class MiniCPMAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -329,7 +330,7 @@ class MiniCPMDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
@ -428,8 +429,8 @@ class MiniCPMModel(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -446,7 +447,7 @@ class MiniCPMModel(nn.Module):
|
||||
for weight_name in ["w1", "w2", "w3"]
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -582,8 +583,8 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -23,7 +23,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -58,7 +58,7 @@ class MiniCPM3Attention(nn.Module):
|
||||
q_lora_rank: int,
|
||||
kv_lora_rank: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
|
||||
@ -23,8 +23,7 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only MiniCPM-O model compatible with HuggingFace weights."""
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict,
|
||||
Union)
|
||||
from typing import Any, Callable, Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -559,8 +558,8 @@ class MiniCPMO(MiniCPMV2_6):
|
||||
self.audio_encoder_layer = -1
|
||||
return model
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self, skip_prefixes=["tts"])
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -26,8 +26,7 @@ import math
|
||||
from collections import defaultdict
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import partial
|
||||
from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict,
|
||||
Union)
|
||||
from typing import Any, Callable, Literal, Optional, TypedDict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -118,7 +117,7 @@ class Resampler2_5(BaseResampler):
|
||||
num_heads: int,
|
||||
kv_dim: Optional[int] = None,
|
||||
norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
|
||||
max_size: Tuple[int, int] = (70, 70),
|
||||
max_size: tuple[int, int] = (70, 70),
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
prefix: str = "") -> None:
|
||||
super().__init__(num_queries,
|
||||
@ -133,7 +132,7 @@ class Resampler2_5(BaseResampler):
|
||||
self._set_2d_pos_cache(self.max_size)
|
||||
|
||||
def _set_2d_pos_cache(self,
|
||||
max_size: Tuple[int, int],
|
||||
max_size: tuple[int, int],
|
||||
device: torch.types.Device = "cpu") -> None:
|
||||
pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim,
|
||||
max_size,
|
||||
@ -203,7 +202,7 @@ class Resampler2_5(BaseResampler):
|
||||
return x
|
||||
|
||||
|
||||
def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
|
||||
def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]:
|
||||
version_float = getattr(config, "version", None)
|
||||
|
||||
# The old configs do not include version number
|
||||
@ -938,8 +937,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
) -> Optional[torch.Tensor]:
|
||||
return self.llm.compute_logits(hidden_states, sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -3,7 +3,8 @@
|
||||
import copy
|
||||
import math
|
||||
import re
|
||||
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
@ -127,7 +128,7 @@ class MiniMaxText01RMSNormTP(CustomOp):
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
residual: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
|
||||
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
|
||||
assert residual is None, "RMSNorm does not support residual connection."
|
||||
return self._forward(x)
|
||||
|
||||
@ -178,7 +179,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp):
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm import _custom_ops as ops
|
||||
self.cos_sin_cache = self.cos_sin_cache.to(positions.device)
|
||||
query_cast = query.to(self.cache_dtype)
|
||||
@ -708,11 +709,11 @@ class MiniMaxText01DecoderLayer(nn.Module):
|
||||
def forward(self,
|
||||
hidden_states: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: Union[List[Dict], Optional[torch.Tensor]],
|
||||
kv_caches: Union[list[dict], Optional[torch.Tensor]],
|
||||
attn_metadata: AttentionMetadata,
|
||||
residual: Optional[torch.Tensor],
|
||||
is_warmup: bool = False,
|
||||
**kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
**kwargs) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
|
||||
forward_context = get_forward_context()
|
||||
attn_metadata = forward_context.attn_metadata
|
||||
@ -1072,10 +1073,10 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid,
|
||||
device=device),
|
||||
})
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
def which_layer(name: str) -> int:
|
||||
if "layers" in name:
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import Literal, Optional, Set, Tuple, TypedDict, Union, cast
|
||||
from typing import Literal, Optional, TypedDict, Union, cast
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -357,7 +357,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict,
|
||||
TypeVar, Union)
|
||||
from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar,
|
||||
Union)
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -589,8 +589,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Mixtral model."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -314,8 +315,8 @@ class MixtralModel(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -332,7 +333,7 @@ class MixtralModel(nn.Module):
|
||||
num_experts=self.config.num_local_experts)
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if (self.quant_config is not None and
|
||||
(scale_name := self.quant_config.get_cache_scale(name))):
|
||||
@ -479,7 +480,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"])
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Mixtral model."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -397,8 +398,8 @@ class MixtralForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -407,7 +408,7 @@ class MixtralForCausalLM(nn.Module, SupportsPP):
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
"""PyTorch Mllama model."""
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -224,7 +224,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
||||
|
||||
return mm_inputs
|
||||
|
||||
def _get_num_image_in_last_group(self, prompt_token_ids: List[int]) -> int:
|
||||
def _get_num_image_in_last_group(self, prompt_token_ids: list[int]) -> int:
|
||||
num_images = 0
|
||||
for token_id in prompt_token_ids[::-1]:
|
||||
if token_id == self.info.get_hf_config().image_token_index:
|
||||
@ -370,8 +370,8 @@ class ColumnParallelConv2dPatch(torch.nn.Module):
|
||||
self,
|
||||
in_channels: int,
|
||||
out_channels: int,
|
||||
kernel_size: Union[int, Tuple[int, int]],
|
||||
stride: Union[int, Tuple[int, int]],
|
||||
kernel_size: Union[int, tuple[int, int]],
|
||||
stride: Union[int, tuple[int, int]],
|
||||
bias: bool = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
@ -603,7 +603,7 @@ class MllamaVisionEncoder(nn.Module):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
) -> Union[Tuple, BaseModelOutput]:
|
||||
) -> Union[BaseModelOutput]:
|
||||
encoder_states = ()
|
||||
|
||||
for i, encoder_layer in enumerate(self.layers):
|
||||
@ -878,7 +878,7 @@ class MllamaTextCrossAttention(nn.Module):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor],
|
||||
kv_range_for_decode: Optional[List[Tuple[int, int]]],
|
||||
kv_range_for_decode: Optional[list[tuple[int, int]]],
|
||||
cross_attention_states: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
q, k, v = self.qkv_proj(hidden_states, cross_attention_states)
|
||||
@ -905,7 +905,7 @@ class MllamaTextCrossAttention(nn.Module):
|
||||
k: torch.Tensor,
|
||||
v: torch.Tensor,
|
||||
attention_mask: torch.Tensor,
|
||||
kv_range_for_decode: List[Tuple[int, int]],
|
||||
kv_range_for_decode: list[tuple[int, int]],
|
||||
) -> torch.Tensor:
|
||||
kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank]
|
||||
attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
|
||||
@ -1019,7 +1019,7 @@ class MllamaCrossAttentionDecoderLayer(torch.nn.Module):
|
||||
hidden_states: torch.Tensor,
|
||||
cross_attention_states: torch.Tensor,
|
||||
cross_attention_mask: torch.Tensor,
|
||||
kv_range_for_decode: Optional[List[Tuple[int, int]]],
|
||||
kv_range_for_decode: Optional[list[tuple[int, int]]],
|
||||
full_text_row_masked_out_mask: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
residual = hidden_states
|
||||
@ -1089,8 +1089,8 @@ class MllamaTextModel(nn.Module):
|
||||
positions: Optional[torch.LongTensor],
|
||||
cross_attention_states: Optional[torch.LongTensor],
|
||||
cross_attention_mask: Optional[torch.LongTensor],
|
||||
kv_range_for_decode: Optional[List[Tuple[int, int]]],
|
||||
full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor,
|
||||
kv_range_for_decode: Optional[list[tuple[int, int]]],
|
||||
full_text_row_masked_out_mask: Optional[tuple[torch.Tensor,
|
||||
torch.Tensor]],
|
||||
skip_cross_attention: bool,
|
||||
) -> torch.Tensor:
|
||||
@ -1150,8 +1150,8 @@ class MllamaForCausalLM(nn.Module):
|
||||
positions: Optional[torch.LongTensor],
|
||||
cross_attention_states: Optional[torch.LongTensor],
|
||||
cross_attention_mask: Optional[torch.LongTensor],
|
||||
kv_range_for_decode: Optional[List[Tuple[int, int]]],
|
||||
full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor,
|
||||
kv_range_for_decode: Optional[list[tuple[int, int]]],
|
||||
full_text_row_masked_out_mask: Optional[tuple[torch.Tensor,
|
||||
torch.Tensor]],
|
||||
skip_cross_attention: bool,
|
||||
) -> torch.Tensor:
|
||||
@ -1221,7 +1221,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return logits
|
||||
|
||||
def unpack_data(self,
|
||||
image_data: Union[List[torch.Tensor], torch.Tensor],
|
||||
image_data: Union[list[torch.Tensor], torch.Tensor],
|
||||
padding_value=0) -> torch.Tensor:
|
||||
if isinstance(image_data, torch.Tensor):
|
||||
# torch.Tensor
|
||||
@ -1230,7 +1230,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
assert isinstance(
|
||||
image_data[0],
|
||||
torch.Tensor), "Image data is not properly batched."
|
||||
# List[torch.Tensor]
|
||||
# list[torch.Tensor]
|
||||
bsz = len(image_data)
|
||||
max_length = max(t.size(0) for t in image_data)
|
||||
trailing_dims = image_data[0].shape[1:]
|
||||
@ -1248,24 +1248,24 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
def _parse_and_validate_image_input(self, **kwargs: object):
|
||||
# tensor with the same shape will be batched together by
|
||||
# MultiModalKwargs.batch, so pixel_values here can be:
|
||||
# - List[torch.Tensor]:
|
||||
# - list[torch.Tensor]:
|
||||
# with shape (num_image, num_tiles, 3, image_res, image_res)
|
||||
# - torch.Tensor:
|
||||
# with shape (bs, num_image, num_tiles, 3, image_res, image_res)
|
||||
pixel_values: Optional[Union[List[List[torch.Tensor]],
|
||||
List[torch.Tensor],
|
||||
pixel_values: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"pixel_values", None)
|
||||
image_embeds: Optional[Union[List[List[torch.Tensor]],
|
||||
List[torch.Tensor],
|
||||
image_embeds: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"image_embeds", None)
|
||||
aspect_ratio_ids: Optional[Union[List[List[torch.Tensor]],
|
||||
List[torch.Tensor],
|
||||
aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"aspect_ratio_ids", None)
|
||||
aspect_ratio_mask: Optional[Union[List[List[torch.Tensor]],
|
||||
List[torch.Tensor],
|
||||
aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]],
|
||||
list[torch.Tensor],
|
||||
torch.Tensor]] = kwargs.pop(
|
||||
"aspect_ratio_mask", None)
|
||||
|
||||
@ -1293,10 +1293,10 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
def _get_and_validate_encoder_lens(
|
||||
self,
|
||||
encoder_seq_lens: List[int],
|
||||
num_tiles: List[List[int]],
|
||||
encoder_seq_lens: list[int],
|
||||
num_tiles: list[list[int]],
|
||||
num_tokens_per_tile: int,
|
||||
) -> List[int]:
|
||||
) -> list[int]:
|
||||
# Get the actual number of encoder tokens for each sample.
|
||||
# Because attn_metadata.encoder_seq_lens only counts the last
|
||||
# group of images for each sample, which is used to cheat the
|
||||
@ -1318,7 +1318,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
def flat_encoder_result(self, cross_attention_states: torch.Tensor,
|
||||
attn_metadata: AttentionMetadata,
|
||||
actual_encoder_seq_lens: List[int]):
|
||||
actual_encoder_seq_lens: list[int]):
|
||||
|
||||
cross_attention_states_flat = torch.zeros(
|
||||
sum(actual_encoder_seq_lens),
|
||||
@ -1342,8 +1342,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
self,
|
||||
image_inputs: MllamaImagePixelInputs,
|
||||
attn_metadata: AttentionMetadata,
|
||||
actual_encoder_seq_lens: List[int],
|
||||
) -> Tuple[torch.Tensor]:
|
||||
actual_encoder_seq_lens: list[int],
|
||||
) -> tuple[torch.Tensor]:
|
||||
# NOTE: llama's reference implementation runs vision model on CPU
|
||||
pixel_values = image_inputs['data']
|
||||
aspect_ratio_ids = image_inputs['aspect_ratio_ids']
|
||||
@ -1367,10 +1367,10 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
attn_metadata: AttentionMetadata,
|
||||
num_tiles: List[List[int]],
|
||||
num_tiles: list[list[int]],
|
||||
num_tokens_per_tile: int,
|
||||
dtype: torch.dtype,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
token_ids = input_ids.tolist()
|
||||
start = 0
|
||||
batch_token_ids = []
|
||||
@ -1422,7 +1422,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
**kwargs: object,
|
||||
) -> Union[Tuple, CausalLMOutputWithPast]:
|
||||
) -> Union[CausalLMOutputWithPast]:
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
if attn_metadata.num_prefill_tokens > 0 and \
|
||||
attn_metadata.num_decode_tokens > 0:
|
||||
@ -1476,8 +1476,8 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
return outputs
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -1487,7 +1487,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
updated_params: Set[str] = set()
|
||||
updated_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if 'patch_embedding.weight' in name:
|
||||
name = name.replace('patch_embedding.weight',
|
||||
@ -1538,7 +1538,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
tower_model="vision_model")
|
||||
|
||||
|
||||
def skip_attention_mask(sparse_mask: List[List[int]]) -> bool:
|
||||
def skip_attention_mask(sparse_mask: list[list[int]]) -> bool:
|
||||
for mask in sparse_mask:
|
||||
# Skip text-only samples.
|
||||
if len(mask) == 0:
|
||||
@ -1556,10 +1556,10 @@ def skip_attention_mask(sparse_mask: List[List[int]]) -> bool:
|
||||
|
||||
|
||||
def convert_sparse_cross_attention_mask_to_dense(
|
||||
sparse_mask: List[List[List[int]]],
|
||||
num_tiles: List[List[int]],
|
||||
lengths: List[int],
|
||||
) -> Tuple[np.ndarray, List[Tuple[int, int]]]:
|
||||
sparse_mask: list[list[list[int]]],
|
||||
num_tiles: list[list[int]],
|
||||
lengths: list[int],
|
||||
) -> tuple[np.ndarray, list[tuple[int, int]]]:
|
||||
total_length = sum(lengths)
|
||||
total_tiles = sum([sum(tiles) for tiles in num_tiles])
|
||||
dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64)
|
||||
|
||||
@ -18,7 +18,7 @@
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping
|
||||
from itertools import tee
|
||||
from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -582,7 +582,7 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> List[PromptUpdate]:
|
||||
) -> list[PromptUpdate]:
|
||||
assert (
|
||||
mm_items.get_count("image", strict=False) == 0
|
||||
or "aspect_ratios" in out_mm_kwargs
|
||||
@ -778,26 +778,26 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
def separate_weights(
|
||||
self,
|
||||
weights: Iterable[Tuple[str, torch.Tensor]],
|
||||
weights: Iterable[tuple[str, torch.Tensor]],
|
||||
prefix: str,
|
||||
) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[
|
||||
) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[
|
||||
str, torch.Tensor]]]:
|
||||
weights1, weights2 = tee(weights, 2)
|
||||
|
||||
def get_prefix_weights() -> Iterable[Tuple[str, torch.Tensor]]:
|
||||
def get_prefix_weights() -> Iterable[tuple[str, torch.Tensor]]:
|
||||
for name, data in weights1:
|
||||
if name.startswith(prefix):
|
||||
yield (name, data)
|
||||
|
||||
def get_other_weights() -> Iterable[Tuple[str, torch.Tensor]]:
|
||||
def get_other_weights() -> Iterable[tuple[str, torch.Tensor]]:
|
||||
for name, data in weights2:
|
||||
if not name.startswith(prefix):
|
||||
yield (name, data)
|
||||
|
||||
return get_prefix_weights(), get_other_weights()
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
@ -806,7 +806,7 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
(".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
updated_params: Set[str] = set()
|
||||
updated_params: set[str] = set()
|
||||
|
||||
# language_model is an Llama4ForCausalLM instance. We load it's
|
||||
# using llama4's load_weights routine.
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import math
|
||||
from typing import Iterable, List, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -148,7 +148,7 @@ class MLPSpeculator(nn.Module):
|
||||
previous_hidden_states: torch.Tensor,
|
||||
num_predict_tokens: int,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> List[SamplerOutput]:
|
||||
) -> list[SamplerOutput]:
|
||||
if num_predict_tokens > self.max_speculative_tokens:
|
||||
raise ValueError(f"Max speculative tokens for model is "
|
||||
f"{self.max_speculative_tokens}, but "
|
||||
@ -190,10 +190,10 @@ class MLPSpeculator(nn.Module):
|
||||
|
||||
return next_tokens
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
name = name.replace("speculator.", "")
|
||||
param = params_dict.get(name)
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from typing import Iterable, Optional, Set, Tuple
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -212,11 +213,11 @@ class ModernBertModel(nn.Module):
|
||||
eps=config.norm_eps,
|
||||
bias=config.norm_bias)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
weights = self.hf_to_vllm_mapper.apply(weights)
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if name.endswith(".bias") and name not in params_dict:
|
||||
continue
|
||||
@ -280,7 +281,7 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
|
||||
self._pooler = CrossEncodingPooler(config, self.classifier,
|
||||
ModernBertPooler(config))
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
|
||||
self_weights = []
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
# https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Union
|
||||
from typing import Union
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -46,17 +46,17 @@ class ModelKeys:
|
||||
|
||||
@dataclass
|
||||
class MultiModelKeys(ModelKeys):
|
||||
language_model: List[str] = field(default_factory=list)
|
||||
connector: List[str] = field(default_factory=list)
|
||||
language_model: list[str] = field(default_factory=list)
|
||||
connector: list[str] = field(default_factory=list)
|
||||
# vision tower and audio tower
|
||||
tower_model: List[str] = field(default_factory=list)
|
||||
generator: List[str] = field(default_factory=list)
|
||||
tower_model: list[str] = field(default_factory=list)
|
||||
generator: list[str] = field(default_factory=list)
|
||||
|
||||
@staticmethod
|
||||
def from_string_field(language_model: Union[str, List[str]] = None,
|
||||
connector: Union[str, List[str]] = None,
|
||||
tower_model: Union[str, List[str]] = None,
|
||||
generator: Union[str, List[str]] = None,
|
||||
def from_string_field(language_model: Union[str, list[str]] = None,
|
||||
connector: Union[str, list[str]] = None,
|
||||
tower_model: Union[str, list[str]] = None,
|
||||
generator: Union[str, list[str]] = None,
|
||||
**kwargs) -> 'MultiModelKeys':
|
||||
|
||||
def to_list(value):
|
||||
|
||||
@ -4,7 +4,7 @@ import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property, partial
|
||||
from typing import List, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Optional, TypedDict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -90,7 +90,7 @@ class MolmoImageInputs(TypedDict):
|
||||
|
||||
@dataclass
|
||||
class VisionBackboneConfig:
|
||||
image_default_input_size: Tuple[int, int] = (336, 336)
|
||||
image_default_input_size: tuple[int, int] = (336, 336)
|
||||
image_patch_size: int = 14
|
||||
image_pos_patch_size: int = 14
|
||||
image_emb_dim: int = 1024
|
||||
@ -267,7 +267,7 @@ class BlockCollection(nn.Module):
|
||||
for _ in range(config.image_num_layers)
|
||||
])
|
||||
|
||||
def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
|
||||
def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
|
||||
hidden_states = []
|
||||
for r in self.resblocks:
|
||||
x = r(x)
|
||||
@ -334,7 +334,7 @@ class VisionTransformer(nn.Module):
|
||||
|
||||
def forward(self,
|
||||
x: torch.Tensor,
|
||||
patch_num: Optional[int] = None) -> List[torch.Tensor]:
|
||||
patch_num: Optional[int] = None) -> list[torch.Tensor]:
|
||||
"""
|
||||
: param x: (batch_size, num_patch, n_pixels)
|
||||
"""
|
||||
@ -434,7 +434,7 @@ class MolmoAttention(nn.Module):
|
||||
)
|
||||
|
||||
def _apply_qk_norm(self, q: torch.Tensor,
|
||||
k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
if self.tp_size > 1:
|
||||
q = tensor_model_parallel_all_gather(q.contiguous())
|
||||
k = tensor_model_parallel_all_gather(k.contiguous())
|
||||
@ -570,7 +570,7 @@ class MolmoDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
||||
) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -596,7 +596,7 @@ class MolmoDecoderNormAfterLayer(MolmoDecoderLayer):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
||||
) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
|
||||
# Self Attention
|
||||
residual = hidden_states
|
||||
hidden_states = self.self_attn(
|
||||
@ -740,15 +740,15 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
|
||||
# image_features: (batch_size, num_image, num_patch, d_model)
|
||||
return image_features
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("merged_linear", "gate_proj", 0),
|
||||
("merged_linear", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
@ -855,10 +855,10 @@ class MolmoModel(nn.Module, SupportsQuant):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
if name.endswith(".bias") and name not in params_dict:
|
||||
@ -1530,7 +1530,7 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
|
||||
loader = AutoWeightsLoader(self)
|
||||
weights = _get_weights_with_merged_embedding(weights)
|
||||
@ -1548,8 +1548,8 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
|
||||
|
||||
|
||||
def _get_weights_with_merged_embedding(
|
||||
weights: Iterable[Tuple[str, torch.Tensor]]
|
||||
) -> Iterable[Tuple[str, torch.Tensor]]:
|
||||
weights: Iterable[tuple[str, torch.Tensor]]
|
||||
) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
embedding_weights = {}
|
||||
for name, weight in weights:
|
||||
if "wte.embedding" in name:
|
||||
|
||||
@ -42,9 +42,10 @@
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
import math
|
||||
from collections.abc import Sequence
|
||||
from copy import deepcopy
|
||||
from functools import cached_property
|
||||
from typing import List, Optional, Sequence, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -222,7 +223,7 @@ class MoonVisionPatchEmbed(nn.Module):
|
||||
self,
|
||||
out_dim: int,
|
||||
in_dim: int = 3,
|
||||
patch_size: Union[int, Tuple[int, int]] = (14, 14),
|
||||
patch_size: Union[int, tuple[int, int]] = (14, 14),
|
||||
pos_emb_height: int = 14,
|
||||
pos_emb_width: int = 14,
|
||||
):
|
||||
@ -526,7 +527,7 @@ def patch_merger(
|
||||
x: torch.Tensor,
|
||||
grid_hw: torch.Tensor,
|
||||
merge_kernel_size: list[int, int] = (2, 2),
|
||||
) -> List[torch.Tensor]:
|
||||
) -> list[torch.Tensor]:
|
||||
d_model = x.size(-1)
|
||||
|
||||
outputs = []
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
|
||||
# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
|
||||
import math
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -265,10 +266,10 @@ class MPTModel(nn.Module):
|
||||
hidden_states = self.norm_f(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
# Skip loading extra bias for GPTQ models.
|
||||
if name.endswith(".bias") and name not in params_dict:
|
||||
@ -323,7 +324,7 @@ class MPTForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only Nemotron model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -69,7 +70,7 @@ def _cast_if_autocast_enabled(*args):
|
||||
class NemotronLayerNorm1P(nn.LayerNorm):
|
||||
|
||||
def __init__(self,
|
||||
normalized_shape: Union[int, List[int], torch.Size],
|
||||
normalized_shape: Union[int, list[int], torch.Size],
|
||||
eps: float = 1e-5,
|
||||
elementwise_affine: bool = True,
|
||||
bias: bool = True,
|
||||
@ -133,7 +134,7 @@ class NemotronAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
bias: bool = False,
|
||||
@ -267,7 +268,7 @@ class NemotronDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -441,8 +442,8 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -450,7 +451,7 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
(".qkv_proj", ".v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only deci model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Type, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -135,7 +136,7 @@ class DeciLMDecoderLayer(nn.Module):
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
|
||||
if self._is_no_op_attention:
|
||||
@ -168,7 +169,7 @@ class DeciModel(nn.Module):
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
layer_type: Type[DeciLMDecoderLayer] = DeciLMDecoderLayer,
|
||||
layer_type: type[DeciLMDecoderLayer] = DeciLMDecoderLayer,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
@ -260,8 +261,8 @@ class DeciModel(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
(".qkv_proj", ".q_proj", "q"),
|
||||
@ -271,7 +272,7 @@ class DeciModel(nn.Module):
|
||||
(".gate_up_proj", ".up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
@ -428,8 +429,8 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head."]
|
||||
|
||||
@ -22,7 +22,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only OLMo model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -209,7 +210,7 @@ class OlmoDecoderLayer(nn.Module):
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
||||
) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
|
||||
# Attention block.
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
@ -338,8 +339,8 @@ class OlmoForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -349,7 +350,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
continue
|
||||
|
||||
@ -23,8 +23,9 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only OLMo2 model compatible with HuggingFace weights."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from functools import partial
|
||||
from typing import Iterable, Optional, Tuple, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -135,7 +136,7 @@ class Olmo2Attention(nn.Module):
|
||||
)
|
||||
|
||||
def _apply_qk_norm(self, q: torch.Tensor,
|
||||
k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
if self.tp_size > 1:
|
||||
q = tensor_model_parallel_all_gather(q.contiguous())
|
||||
k = tensor_model_parallel_all_gather(k.contiguous())
|
||||
@ -365,7 +366,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
|
||||
@ -12,7 +12,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only OLMoE model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -102,7 +103,7 @@ class OlmoeAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 4096,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -307,8 +308,8 @@ class OlmoeModel(nn.Module):
|
||||
hidden_states, _ = self.norm(hidden_states, residual)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -327,7 +328,7 @@ class OlmoeModel(nn.Module):
|
||||
num_experts=self.config.num_experts)
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
# Skip non-stacked layers and experts (experts handled below).
|
||||
@ -439,8 +440,8 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=["rotary_emb.inv_freq"],
|
||||
|
||||
@ -18,7 +18,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only OPT model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -312,8 +313,8 @@ class OPTModel(nn.Module):
|
||||
intermediate_tensors,
|
||||
inputs_embeds=inputs_embeds)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -321,7 +322,7 @@ class OPTModel(nn.Module):
|
||||
("qkv_proj", "v_proj", "v"),
|
||||
]
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
@ -400,8 +401,8 @@ class OPTForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head.weight"]
|
||||
|
||||
@ -5,7 +5,8 @@
|
||||
# Copyright (c) OrionStar Inc.
|
||||
# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE
|
||||
"""Inference-only Orion-14B model compatible with HuggingFace weights."""
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -72,7 +73,7 @@ class OrionAttention(nn.Module):
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
rope_theta: float = 10000,
|
||||
rope_scaling: Optional[Dict[str, Any]] = None,
|
||||
rope_scaling: Optional[dict[str, Any]] = None,
|
||||
max_position_embeddings: int = 8192,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
@ -186,7 +187,7 @@ class OrionDecoderLayer(nn.Module):
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
# Self Attention
|
||||
residual = hidden_states
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
@ -259,8 +260,8 @@ class OrionModel(nn.Module):
|
||||
hidden_states = self.norm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -270,7 +271,7 @@ class OrionModel(nn.Module):
|
||||
("gate_up_proj", "up_proj", 1),
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
for (param_name, weight_name, shard_id) in stacked_params_mapping:
|
||||
if weight_name not in name:
|
||||
@ -341,8 +342,8 @@ class OrionForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=([
|
||||
|
||||
@ -17,8 +17,8 @@
|
||||
# limitations under the License.
|
||||
""" PyTorch Ovis model."""
|
||||
import math
|
||||
from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
|
||||
TypedDict, Union)
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -211,7 +211,7 @@ class OvisImagePatchInputs(TypedDict):
|
||||
`(batch_size * (num_patches + 1))`
|
||||
"""
|
||||
|
||||
patches_per_image: List[int]
|
||||
patches_per_image: list[int]
|
||||
"""
|
||||
List of number of total patches for each image in the batch.
|
||||
This is used to restore the first two dimensions of `flat_data`.
|
||||
@ -545,8 +545,8 @@ class Ovis(nn.Module, SupportsMultiModal):
|
||||
logits = self.llm.compute_logits(hidden_states, sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -391,7 +391,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -21,7 +21,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Inference-only persimmon model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -260,10 +261,10 @@ class PersimmonModel(nn.Module):
|
||||
hidden_states = self.final_layernorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters(remove_duplicate=False))
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if is_pp_missing_parameter(name, self):
|
||||
continue
|
||||
@ -336,7 +337,7 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -36,7 +36,8 @@
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"""Inference-only Phi-1.5 model compatible with HuggingFace weights."""
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -248,8 +249,8 @@ class PhiModel(nn.Module):
|
||||
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
stacked_params_mapping = [
|
||||
# (param_name, shard_name, shard_id)
|
||||
("qkv_proj", "q_proj", "q"),
|
||||
@ -257,7 +258,7 @@ class PhiModel(nn.Module):
|
||||
("qkv_proj", "v_proj", "v")
|
||||
]
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
|
||||
for name, loaded_weight in weights:
|
||||
if "rotary_emb.inv_freq" in name:
|
||||
@ -348,7 +349,7 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||
sampling_metadata, self.lm_head.bias)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(self)
|
||||
return loader.load_weights(weights)
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import math
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -230,8 +231,8 @@ class Phi3SmallSelfAttention(nn.Module):
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor],
|
||||
Optional[Tuple[torch.Tensor]]]:
|
||||
) -> tuple[torch.Tensor, Optional[torch.Tensor],
|
||||
Optional[tuple[torch.Tensor]]]:
|
||||
qkv, _ = self.query_key_value(hidden_states)
|
||||
|
||||
qkv = qkv.view(qkv.shape[:-1] +
|
||||
@ -352,10 +353,10 @@ class Phi3SmallModel(nn.Module):
|
||||
hidden_states = self.final_layernorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
params_dict = dict(self.named_parameters())
|
||||
loaded_params: Set[str] = set()
|
||||
loaded_params: set[str] = set()
|
||||
for name, loaded_weight in weights:
|
||||
if name.endswith(".bias") and name not in params_dict:
|
||||
continue
|
||||
@ -454,8 +455,8 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP):
|
||||
output_hidden_states = output_hidden_states
|
||||
return output_hidden_states
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
loader = AutoWeightsLoader(
|
||||
self,
|
||||
skip_prefixes=(["lm_head.weight"]
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
# limitations under the License.
|
||||
import re
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union
|
||||
from typing import Any, Literal, Optional, TypedDict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@ -94,7 +94,7 @@ def _init_img_processor(hf_config: PretrainedConfig,
|
||||
|
||||
class Phi3VImagePixelInputs(TypedDict):
|
||||
type: Literal["pixel_values"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Shape:
|
||||
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
|
||||
@ -113,7 +113,7 @@ class Phi3VImagePixelInputs(TypedDict):
|
||||
|
||||
class Phi3VImageEmbeddingInputs(TypedDict):
|
||||
type: Literal["image_embeds"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
|
||||
|
||||
`hidden_size` must match the hidden size of language model backbone.
|
||||
@ -571,8 +571,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
return data
|
||||
|
||||
def _validate_pixel_values(
|
||||
self, data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, List[torch.Tensor]]:
|
||||
self, data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
|
||||
h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size
|
||||
expected_dims = (3, h, w)
|
||||
@ -707,8 +707,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
return self.language_model.compute_logits(hidden_states,
|
||||
sampling_metadata)
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
torch.Tensor]]) -> Set[str]:
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> set[str]:
|
||||
|
||||
loader = AutoWeightsLoader(self)
|
||||
autoloaded_weights = loader.load_weights(weights,
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
|
||||
from typing import Any, Literal, Optional, TypedDict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -392,7 +392,7 @@ class Phi4MMImageEncoder(nn.Module):
|
||||
|
||||
class Phi4MMImagePixelInputs(TypedDict):
|
||||
type: Literal["pixel_values"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
Shape:
|
||||
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
|
||||
@ -417,7 +417,7 @@ class Phi4MMImagePixelInputs(TypedDict):
|
||||
|
||||
class Phi4MMImageEmbeddingInputs(TypedDict):
|
||||
type: Literal["image_embeds"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
|
||||
|
||||
`hidden_size` must match the hidden size of language model backbone.
|
||||
@ -426,7 +426,7 @@ class Phi4MMImageEmbeddingInputs(TypedDict):
|
||||
|
||||
class Phi4MMAudioFeatureInputs(TypedDict):
|
||||
type: Literal["audio_features"]
|
||||
data: Union[torch.Tensor, List[torch.Tensor]]
|
||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""Shape: `(batch_size * num_audios, 80, M)"""
|
||||
|
||||
|
||||
@ -1031,7 +1031,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
|
||||
return audio_embeds
|
||||
|
||||
def _parse_and_validate_image_input(self,
|
||||
**kwargs: object) -> Optional[Dict]:
|
||||
**kwargs: object) -> Optional[dict]:
|
||||
input_image_embeds: NestedTensors = kwargs.get("input_image_embeds")
|
||||
if input_image_embeds is None:
|
||||
return None
|
||||
@ -1238,7 +1238,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
|
||||
sampling_metadata)
|
||||
return logits
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str,
|
||||
def load_weights(self, weights: Iterable[tuple[str,
|
||||
torch.Tensor]]) -> None:
|
||||
weights = ((name, data) for name, data in weights
|
||||
if "lora" not in name)
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#!/usr/bin/env python3
|
||||
import abc
|
||||
import math
|
||||
from typing import List, Literal, Optional
|
||||
from typing import Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@ -746,7 +746,7 @@ class ConformerEncoder(TransformerEncoderBase):
|
||||
attention_group_size = attenion_heads = Multi-Query Attention
|
||||
"""
|
||||
|
||||
extra_multi_layer_output_idxs: List[int]
|
||||
extra_multi_layer_output_idxs: list[int]
|
||||
|
||||
def __init__( # pylint: disable-all
|
||||
self,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user