Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-25 21:35:19 +08:00)
Standardise get_rope to use rope_parameters["partial_rotary_factor"], not rotary_dim (#30389)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent 92fea56fd1
commit cf3eacfe58
@@ -99,7 +99,6 @@ def benchmark_mrope(
     # the parameters to compute the q k v size based on tp_size
     mrope_helper_class = get_rope(
         head_size=head_dim,
-        rotary_dim=head_dim,
         max_position=max_position,
         is_neox_style=is_neox_style,
         rope_parameters=rope_parameters,
@@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
     def benchmark(batch_size, seq_len, num_heads, provider):
         dtype = torch.bfloat16
         max_position = 8192
-        base = 10000
-        rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+        rope_parameters = {"partial_rotary_factor": rotary_dim / head_size}
+        rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
         rope = rope.to(dtype=dtype, device=device)
         cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device)

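The call-site change above is the pattern repeated throughout this commit: the rotary width is no longer passed as its own argument but is encoded as a fraction of the head size inside rope_parameters. A minimal before/after sketch based on the signature shown in this diff (the sizes are illustrative, not taken from any particular model):

from vllm.model_executor.layers.rotary_embedding import get_rope

head_size, rotary_dim, max_position = 128, 64, 8192

# Before: rope = get_rope(head_size, rotary_dim, max_position, 10000, is_neox_style)
# After: the rotary slice is expressed as a fraction of head_size.
rope = get_rope(
    head_size,
    max_position=max_position,
    is_neox_style=True,
    rope_parameters={
        "rope_type": "default",
        "rope_theta": 10000,
        "partial_rotary_factor": rotary_dim / head_size,  # 0.5 -> rotate 64 of 128 dims
    },
)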
@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module):


 class TestRotaryEmbedding(torch.nn.Module):
-    def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000):
+    def __init__(self, head_dim=64, max_position=2048, base=10000):
         super().__init__()
         self.head_dim = head_dim
-        self.rotary_dim = rotary_dim or head_dim

         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.rotary_dim,
             max_position=max_position,
             rope_parameters={"rope_type": "default", "rope_theta": base},
         )
@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):

         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.head_dim,
             max_position=max_position,
             rope_parameters={"rope_type": "default", "rope_theta": base},
         )
@@ -116,7 +116,6 @@ def test_mrope(

     mrope_helper_class = get_rope(
         head_size=head_dim,
-        rotary_dim=head_dim,
         max_position=max_position,
         is_neox_style=is_neox_style,
         rope_parameters=config.rope_parameters,
@@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing(

     mrope_helper_class = get_rope(
         head_size=head_dim,
-        rotary_dim=head_dim,
         max_position=max_position,
         is_neox_style=is_neox_style,
         rope_parameters=config.rope_parameters,
@@ -83,8 +83,12 @@ def test_rotary_embedding(
     torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size
-    rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
-    rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters)
+    rope_parameters = {
+        "rope_type": "default",
+        "rope_theta": rope_theta,
+        "partial_rotary_factor": rotary_dim / head_size,
+    }
+    rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
     rope = rope.to(dtype=dtype, device=torch.get_default_device())

     positions = torch.randint(0, max_position, (batch_size, seq_len))
@@ -150,9 +154,9 @@ def test_rope_module_cache():
     if rotary_dim is None:
         rotary_dim = head_size
     rope_parameters["rope_theta"] = rope_theta
+    rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
     rope = get_rope(
         head_size,
-        rotary_dim,
         max_position,
         is_neox_style,
         rope_parameters,
@@ -177,9 +181,9 @@ def test_rope_module_cache():
     if rotary_dim is None:
         rotary_dim = head_size
     rope_parameters["rope_theta"] = rope_theta
+    rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
     rope = get_rope(
         head_size,
-        rotary_dim,
         max_position,
         is_neox_style,
         rope_parameters,
@@ -73,14 +73,28 @@ def get_field(cls: ConfigType, name: str) -> Field:
     )


-def getattr_iter(object: object, names: Iterable[str], default: Any) -> Any:
+def getattr_iter(
+    object: object, names: Iterable[str], default: Any, warn: bool = False
+) -> Any:
     """
     A helper function that retrieves an attribute from an object which may
     have multiple possible names. This is useful when fetching attributes from
     arbitrary `transformers.PretrainedConfig` instances.
+
+    In the case where the first name in `names` is the preferred name, and
+    any other names are deprecated aliases, setting `warn=True` will log a
+    warning when a deprecated name is used.
     """
-    for name in names:
+    for i, name in enumerate(names):
         if hasattr(object, name):
+            if warn and i > 0:
+                logger.warning_once(
+                    "%s contains a deprecated attribute name '%s'. "
+                    "Please use the preferred attribute name '%s' instead.",
+                    type(object).__name__,
+                    name,
+                    names[0],
+                )
             return getattr(object, name)
     return default
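A small usage sketch of the updated helper, mirroring how patch_rope_parameters later in this diff calls it; the SimpleNamespace is only a stand-in for an arbitrary PretrainedConfig:

from types import SimpleNamespace

from vllm.config.utils import getattr_iter

# A config that still uses the legacy field name "rotary_emb_base".
legacy_config = SimpleNamespace(rotary_emb_base=1_000_000)

# "rope_theta" is preferred; "rotary_emb_base" is a deprecated alias, so
# warn=True makes getattr_iter emit a one-time deprecation warning here.
rope_theta = getattr_iter(
    legacy_config, ["rope_theta", "rotary_emb_base"], None, warn=True
)
assert rope_theta == 1_000_000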
@@ -25,7 +25,6 @@ _ROPE_DICT: dict[tuple, RotaryEmbedding] = {}

 def get_rope(
     head_size: int,
-    rotary_dim: int,
     max_position: int,
     is_neox_style: bool = True,
     rope_parameters: dict[str, Any] | None = None,
@@ -54,12 +53,15 @@ def get_rope(
     else:
         dual_chunk_attention_args = None

-    partial_rotary_factor = 1.0
-    if rope_parameters is not None:
-        partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
+    rope_parameters = rope_parameters or {}
+    base = rope_parameters.get("rope_theta", 10000)
+    scaling_type = rope_parameters.get("rope_type", "default")
+    partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)

+    if partial_rotary_factor <= 0.0 or partial_rotary_factor > 1.0:
+        raise ValueError(f"{partial_rotary_factor=} must be between 0.0 and 1.0")
+    rotary_dim = int(head_size * partial_rotary_factor)

-    if partial_rotary_factor < 1.0:
-        rotary_dim = int(rotary_dim * partial_rotary_factor)
     key = (
         head_size,
         rotary_dim,
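The net effect of the block above is that callers describe the rotated slice as a fraction of the head size, and get_rope recovers the integer width itself. A small worked example of that relationship (values are illustrative):

head_size = 128
rope_parameters = {"partial_rotary_factor": 0.25}

partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
rotary_dim = int(head_size * partial_rotary_factor)
assert rotary_dim == 32  # only the first 32 of 128 dims are rotated

# A caller that previously passed rotary_dim=32 explicitly now expresses
# exactly the same thing as partial_rotary_factor = 32 / 128.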
@@ -72,7 +74,6 @@ def get_rope(
     if key in _ROPE_DICT:
         return _ROPE_DICT[key]

-    base = rope_parameters["rope_theta"] if rope_parameters else 10000
     if dual_chunk_attention_config is not None:
         extra_kwargs = {
             k: v
@@ -88,109 +89,76 @@ def get_rope(
dtype,
**extra_kwargs,
)
elif not rope_parameters:
rotary_emb = RotaryEmbedding(
elif scaling_type == "default":
if "mrope_section" in rope_parameters:
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
)
else:
rotary_emb = RotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
)
elif scaling_type == "llama3":
scaling_factor = rope_parameters["factor"]
low_freq_factor = rope_parameters["low_freq_factor"]
high_freq_factor = rope_parameters["high_freq_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
rotary_emb = Llama3RotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
scaling_factor,
low_freq_factor,
high_freq_factor,
original_max_position,
)
elif scaling_type == "mllama4":
rotary_emb = Llama4VisionRotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
else:
scaling_type = rope_parameters["rope_type"]

if scaling_type == "llama3":
scaling_factor = rope_parameters["factor"]
low_freq_factor = rope_parameters["low_freq_factor"]
high_freq_factor = rope_parameters["high_freq_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
rotary_emb = Llama3RotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
scaling_factor,
low_freq_factor,
high_freq_factor,
original_max_position,
)
elif scaling_type == "mllama4":
rotary_emb = Llama4VisionRotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
elif scaling_type == "default":
if "mrope_section" in rope_parameters:
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
)
else:
rotary_emb = RotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
)
elif scaling_type == "linear":
scaling_factor = rope_parameters["factor"]
rotary_emb = LinearScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
)
elif scaling_type == "ntk":
scaling_factor = rope_parameters["factor"]
mixed_b = rope_parameters.get("mixed_b")
rotary_emb = NTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
mixed_b,
)
elif scaling_type == "dynamic":
if "alpha" in rope_parameters:
scaling_alpha = rope_parameters["alpha"]
rotary_emb = DynamicNTKAlphaRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_alpha,
dtype,
)
elif "factor" in rope_parameters:
scaling_factor = rope_parameters["factor"]
rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
)
else:
raise ValueError(
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
)
elif scaling_type == "xdrope":
elif scaling_type == "linear":
scaling_factor = rope_parameters["factor"]
rotary_emb = LinearScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
)
elif scaling_type == "ntk":
scaling_factor = rope_parameters["factor"]
mixed_b = rope_parameters.get("mixed_b")
rotary_emb = NTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
mixed_b,
)
elif scaling_type == "dynamic":
if "alpha" in rope_parameters:
scaling_alpha = rope_parameters["alpha"]
rotary_emb = XDRotaryEmbedding(
rotary_emb = DynamicNTKAlphaRotaryEmbedding(
head_size,
rotary_dim,
max_position,
@@ -198,67 +166,66 @@ def get_rope(
is_neox_style,
scaling_alpha,
dtype,
xdrope_section=rope_parameters["xdrope_section"],
)
elif scaling_type == "yarn":
elif "factor" in rope_parameters:
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
"attn_factor",
"beta_fast",
"beta_slow",
"apply_yarn_scaling",
"truncate",
)
}
if "mrope_section" in rope_parameters:
extra_kwargs.pop("apply_yarn_scaling", None)
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
scaling_factor=scaling_factor,
**extra_kwargs,
)
else:
rotary_emb = YaRNScalingRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
scaling_factor,
dtype,
**extra_kwargs,
)
elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]:
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
# assert max_position == original_max_position * scaling_factor
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
"attn_factor",
"beta_fast",
"beta_slow",
"mscale",
"mscale_all_dim",
)
}
rotary_emb = DeepseekScalingRotaryEmbedding(
rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
)
else:
raise ValueError(
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
)
elif scaling_type == "xdrope":
scaling_alpha = rope_parameters["alpha"]
rotary_emb = XDRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_alpha,
dtype,
xdrope_section=rope_parameters["xdrope_section"],
)
elif scaling_type == "yarn":
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
"attn_factor",
"beta_fast",
"beta_slow",
"apply_yarn_scaling",
"truncate",
)
}
if "mrope_section" in rope_parameters:
extra_kwargs.pop("apply_yarn_scaling", None)
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
scaling_factor=scaling_factor,
**extra_kwargs,
)
else:
rotary_emb = YaRNScalingRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
@@ -268,28 +235,55 @@ def get_rope(
dtype,
**extra_kwargs,
)
elif scaling_type == "longrope":
short_factor = rope_parameters["short_factor"]
long_factor = rope_parameters["long_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k in ("short_mscale", "long_mscale")
}
rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
head_size,
rotary_dim,
max_position,
original_max_position,
base,
is_neox_style,
dtype,
short_factor,
long_factor,
**extra_kwargs,
elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]:
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
# assert max_position == original_max_position * scaling_factor
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
"attn_factor",
"beta_fast",
"beta_slow",
"mscale",
"mscale_all_dim",
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
}
rotary_emb = DeepseekScalingRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
scaling_factor,
dtype,
**extra_kwargs,
)
elif scaling_type == "longrope":
short_factor = rope_parameters["short_factor"]
long_factor = rope_parameters["long_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k in ("short_mscale", "long_mscale")
}
rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
head_size,
rotary_dim,
max_position,
original_max_position,
base,
is_neox_style,
dtype,
short_factor,
long_factor,
**extra_kwargs,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
_ROPE_DICT[key] = rotary_emb
return rotary_emb
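With the dispatch above, the only per-model knob is the shape of rope_parameters. A few illustrative dictionaries, restricted to keys the branches above actually read (the numeric values are made up for the sketch):

# "default" RoPE, rotating only half of each head:
default_params = {
    "rope_type": "default",
    "rope_theta": 10000,
    "partial_rotary_factor": 0.5,
}

# Llama-3 style frequency scaling (keys read by the "llama3" branch):
llama3_params = {
    "rope_type": "llama3",
    "rope_theta": 500000,
    "factor": 8.0,
    "low_freq_factor": 1.0,
    "high_freq_factor": 4.0,
    "original_max_position_embeddings": 8192,
}

# YaRN scaling (keys read by the "yarn" branch; extras like beta_fast are optional):
yarn_params = {
    "rope_type": "yarn",
    "rope_theta": 10000,
    "factor": 4.0,
    "original_max_position_embeddings": 32768,
    "beta_fast": 32,
    "beta_slow": 1,
}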
@ -241,7 +241,6 @@ class AfmoeAttention(nn.Module):
|
||||
if self.is_local_attention:
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config["rope_parameters"],
|
||||
is_neox_style=True,
|
||||
|
||||
@ -226,7 +226,6 @@ class ApertusAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -314,7 +314,6 @@ class ArcticAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -189,7 +189,6 @@ class BaiChuanAttention(nn.Module):
|
||||
else:
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@@ -127,11 +127,11 @@ class BailingAttention(nn.Module):
             prefix=f"{prefix}.dense",
         )

-        self.rotary_dim = getattr(config, "rotary_dim", self.head_dim)
+        rotary_dim = getattr(config, "rotary_dim", self.head_dim)
+        config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim

         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.rotary_dim,
             max_position=config.max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=True,
@@ -178,14 +178,11 @@ class BambaAttentionDecoderLayer(nn.Module):
         self.scaling = self.head_dim**-0.5
         self.max_position_embeddings = max_position_embeddings

-        if hasattr(config, "attn_rotary_emb"):
-            rotary_dim = config.attn_rotary_emb  # for backward compatibility
-        else:
-            rotary_dim = self.head_dim  # default
+        rotary_dim = getattr(config, "attn_rotary_emb", self.head_dim)
+        config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim

         self.rotary_emb = get_rope(
             head_size=self.head_dim,
-            rotary_dim=rotary_dim,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
             is_neox_style=True,
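The Bailing/Bamba changes above show the recurring model-level pattern in this commit: read whatever legacy per-model rotary width exists, then record it on the config as a fraction. A condensed sketch of that pattern; the config object here is a stand-in, not a real HF config class:

from types import SimpleNamespace

# Stand-in for a config that still carries a legacy "attn_rotary_emb" width.
config = SimpleNamespace(
    attn_rotary_emb=32,
    rope_parameters={"rope_type": "default", "rope_theta": 10000},
)
head_dim = 128

# Fold the legacy width into the standardised rope_parameters dict.
rotary_dim = getattr(config, "attn_rotary_emb", head_dim)
config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim  # 0.25

# get_rope(...) is then called without a rotary_dim argument; it recovers
# int(head_dim * 0.25) == 32 internally.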
@@ -314,7 +314,6 @@ class ChameleonAttention(nn.Module):
         self.k_norm = ChameleonLayerNorm((self.num_kv_heads, self.head_dim))
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
             rope_parameters=rope_parameters,
         )
@@ -99,13 +99,16 @@ class GLMAttention(nn.Module):
         # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
         rope_ratio = getattr(config, "rope_ratio", 1.0)
         max_positions = getattr(config, "seq_length", 8192)
-        rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio}
+        rope_parameters = {
+            "rope_type": "default",
+            "rope_theta": 10000 * rope_ratio,
+            "partial_rotary_factor": 0.5,
+        }
         # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False,
         # which is equivalent to is_neox_style=True
         is_neox_style = not config.original_rope
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.head_dim // 2,
             max_position=max_positions,
             rope_parameters=rope_parameters,
             is_neox_style=is_neox_style,
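Call sites that previously rotated half of each head (rotary_dim=head_dim // 2), including the vision transformers later in this diff, map onto the same factor. A quick sanity check, assuming an even head_dim (the value is illustrative):

head_dim = 64
# partial_rotary_factor=0.5 reproduces the old rotary_dim=head_dim // 2 exactly
assert int(head_dim * 0.5) == head_dim // 2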
@ -175,7 +175,6 @@ class CohereAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=False,
|
||||
|
||||
@ -42,9 +42,10 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
|
||||
config.hidden_act = "geglu"
|
||||
|
||||
head_dim = config.hidden_size // config.num_attention_heads
|
||||
rotary_dim = getattr(config, "rotary_emb_dim", head_dim)
|
||||
config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim
|
||||
config.rotary_kwargs = {
|
||||
"head_size": head_dim,
|
||||
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
|
||||
"max_position": config.max_position_embeddings,
|
||||
"rope_parameters": config.rope_parameters,
|
||||
}
|
||||
@ -77,9 +78,11 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
|
||||
if not model_config.enforce_eager:
|
||||
max_position = round_up(max_position, 8)
|
||||
|
||||
rotary_dim = getattr(config, "rotary_emb_dim", head_dim)
|
||||
config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim
|
||||
|
||||
config.rotary_kwargs = {
|
||||
"head_size": head_dim,
|
||||
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
|
||||
"max_position": max_position,
|
||||
"rope_parameters": config.rope_parameters,
|
||||
}
|
||||
@ -113,12 +116,10 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
|
||||
config.num_hidden_layers = config.n_layer
|
||||
|
||||
head_dim = config.hidden_size // config.num_attention_heads
|
||||
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
|
||||
max_trained_positions = getattr(config, "max_trained_positions", 2048)
|
||||
|
||||
config.rotary_kwargs = {
|
||||
"head_size": head_dim,
|
||||
"rotary_dim": rotary_emb_dim,
|
||||
"max_position": max_trained_positions,
|
||||
"rope_parameters": config.rope_parameters,
|
||||
}
|
||||
@ -240,9 +241,10 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
|
||||
config.hidden_act = "geglu"
|
||||
|
||||
head_dim = config.hidden_size // config.num_attention_heads
|
||||
rotary_dim = getattr(config, "rotary_emb_dim", head_dim)
|
||||
config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim
|
||||
config.rotary_kwargs = {
|
||||
"head_size": head_dim,
|
||||
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
|
||||
"max_position": config.max_position_embeddings,
|
||||
"rope_parameters": config.rope_parameters,
|
||||
}
|
||||
|
||||
@ -222,7 +222,6 @@ class DbrxAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -156,7 +156,6 @@ class DeepseekAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
@ -499,7 +498,6 @@ class DeepseekV2Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
qk_rope_head_dim,
|
||||
rotary_dim=qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=False,
|
||||
@ -1018,7 +1016,6 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
qk_rope_head_dim,
|
||||
rotary_dim=qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=False,
|
||||
@ -1038,7 +1035,6 @@ class DeepseekV2MLAAttention(nn.Module):
|
||||
if self.is_v32:
|
||||
self.indexer_rope_emb = get_rope(
|
||||
qk_rope_head_dim,
|
||||
rotary_dim=qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -250,7 +250,6 @@ class Dots1Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -288,7 +288,6 @@ class Ernie4_5_MoeAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=False,
|
||||
|
||||
@ -167,7 +167,6 @@ class ExaoneAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -176,7 +176,6 @@ class Exaone4Attention(nn.Module):
|
||||
set_default_rope_theta(config, default_theta=1000000)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -167,7 +167,6 @@ class FalconAttention(nn.Module):
|
||||
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -242,14 +242,11 @@ class FalconH1AttentionDecoderLayer(nn.Module):
|
||||
self.scaling = self.head_dim**-0.5
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
|
||||
if hasattr(config, "attn_rotary_emb"):
|
||||
rotary_dim = config.attn_rotary_emb # for backward compatibility
|
||||
else:
|
||||
rotary_dim = self.head_dim # default
|
||||
rotary_dim = getattr(config, "attn_rotary_emb", self.head_dim)
|
||||
config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
head_size=self.head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -174,7 +174,6 @@ class GemmaAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -152,7 +152,6 @@ class Gemma2Attention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -176,7 +176,6 @@ class Gemma3Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -384,7 +384,6 @@ class Gemma3nAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -81,7 +81,6 @@ class Glm4Attention(nn.Module):
|
||||
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
self.head_dim = head_dim or hidden_size // self.total_num_heads
|
||||
self.rotary_dim = self.head_dim
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = self.head_dim**-0.5
|
||||
@ -103,7 +102,6 @@ class Glm4Attention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.rotary_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=False,
|
||||
|
||||
@ -678,9 +678,9 @@ class Glm4vVisionTransformer(nn.Module):
|
||||
head_dim = self.hidden_size // self.num_heads
|
||||
self.rotary_pos_emb = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim // 2,
|
||||
max_position=8192,
|
||||
is_neox_style=True,
|
||||
rope_parameters={"partial_rotary_factor": 0.5},
|
||||
)
|
||||
self.blocks = nn.ModuleList(
|
||||
[
|
||||
|
||||
@ -285,7 +285,6 @@ class Glm4MoeAttention(nn.Module):
|
||||
config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@@ -95,12 +95,13 @@ class GPTJAttention(nn.Module):
         scaling = self.head_size**-0.5
         assert getattr(config, "rotary", True)
         assert config.rotary_dim % 2 == 0
+        rope_parameters = getattr(config, "rope_parameters", {})
+        rope_parameters["partial_rotary_factor"] = config.rotary_dim / self.head_size
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             self.head_size,
-            rotary_dim=config.rotary_dim,
             max_position=max_position_embeddings,
-            rope_parameters=getattr(config, "rope_parameters", None),
+            rope_parameters=rope_parameters,
             is_neox_style=False,
         )
         self.attn = Attention(
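GPT-J is the canonical partial-rotary model; for the 6B checkpoint (4096 hidden, 16 heads, rotary_dim=64) the factor works out to 0.25. A small check of that arithmetic, with the sizes quoted here only for illustration:

# GPT-J-6B sizes: hidden 4096, 16 heads -> head_size 256, rotary_dim 64.
head_size, rotary_dim = 256, 64
partial_rotary_factor = rotary_dim / head_size
assert partial_rotary_factor == 0.25
assert int(head_size * partial_rotary_factor) == rotary_dim  # round-trips exactly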
@@ -92,7 +92,6 @@ class GPTNeoXAttention(nn.Module):
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             self.head_size,
-            rotary_dim=self.head_size,
             max_position=max_position_embeddings,
             rope_parameters=config.rope_parameters,
         )
@ -67,7 +67,6 @@ class OAIAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=config.max_position_embeddings,
|
||||
dtype=torch.float32,
|
||||
rope_parameters={
|
||||
|
||||
@ -160,7 +160,6 @@ class GraniteAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -190,7 +190,6 @@ class GraniteMoeAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -271,7 +271,6 @@ class GraniteMoeHybridAttention(nn.Module):
|
||||
if config.position_embedding_type == "rope":
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=config.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -181,7 +181,6 @@ class Grok1Attention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -199,7 +199,6 @@ class HunYuanAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
@ -305,7 +304,6 @@ class HunYuanCrossAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -140,7 +140,6 @@ class InternLM2Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -143,7 +143,6 @@ class Lfm2Attention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -236,7 +236,6 @@ class Lfm2MoeAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -259,7 +259,6 @@ class LlamaAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=getattr(config, "rope_parameters", None),
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -243,7 +243,6 @@ class Llama4Attention(nn.Module):
|
||||
self.rotary_emb = (
|
||||
get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -277,7 +277,6 @@ class MiniCPMAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -120,7 +120,6 @@ class MiniCPM3Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.qk_rope_head_dim,
|
||||
rotary_dim=self.qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -199,9 +199,13 @@ class MiniMaxM2Attention(nn.Module):
|
||||
prefix=f"{prefix}.o_proj",
|
||||
)
|
||||
|
||||
if (
|
||||
rope_parameters is not None
|
||||
and "partial_rotary_factor" not in rope_parameters
|
||||
):
|
||||
rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -187,7 +187,6 @@ class MiniMaxText01Attention(nn.Module):
|
||||
num_heads: int,
|
||||
head_dim: int,
|
||||
num_kv_heads: int,
|
||||
rotary_dim: int,
|
||||
max_position: int = 4096 * 32,
|
||||
rope_parameters: dict | None = None,
|
||||
sliding_window: int | None = None,
|
||||
@ -245,7 +244,6 @@ class MiniMaxText01Attention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
head_size=self.head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
@ -290,6 +288,8 @@ class MiniMaxText01DecoderLayer(nn.Module):
|
||||
head_dim = getattr(config, "head_dim", None)
|
||||
if head_dim is None:
|
||||
head_dim = config.hidden_size // config.num_attention_heads
|
||||
rotary_dim = getattr(config, "rotary_dim", head_dim)
|
||||
config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim
|
||||
if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
|
||||
max_position_embeddings = min(
|
||||
config.max_position_embeddings, config.max_model_len
|
||||
@ -321,9 +321,6 @@ class MiniMaxText01DecoderLayer(nn.Module):
|
||||
hidden_size=self.hidden_size,
|
||||
num_heads=config.num_attention_heads,
|
||||
head_dim=head_dim,
|
||||
rotary_dim=config.rotary_dim
|
||||
if hasattr(config, "rotary_dim")
|
||||
else head_dim,
|
||||
num_kv_heads=config.num_key_value_heads,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
|
||||
@ -206,7 +206,6 @@ class MixtralAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -295,11 +295,11 @@ class Llama4VisionAttention(nn.Module):
|
||||
rope_parameters = {
|
||||
"rope_type": "mllama4",
|
||||
"rope_theta": config.rope_parameters["rope_theta"],
|
||||
"partial_rotary_factor": 0.5,
|
||||
}
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
head_size=self.head_dim,
|
||||
rotary_dim=config.hidden_size // config.num_attention_heads // 2,
|
||||
# number of image patches
|
||||
max_position=(config.image_size // config.patch_size) ** 2,
|
||||
rope_parameters=rope_parameters,
|
||||
|
||||
@ -105,7 +105,6 @@ class ModernBertAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
head_size=self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=config.max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
dtype=torch.float16,
|
||||
|
||||
@ -433,7 +433,6 @@ class MolmoAttention(nn.Module):
|
||||
# Rotary embeddings.
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -199,7 +199,6 @@ class NemotronAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -118,7 +118,6 @@ class DeciLMAttention(LlamaAttention):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -102,7 +102,6 @@ class OlmoAttention(nn.Module):
|
||||
# Rotary embeddings.
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -146,7 +146,6 @@ class Olmo2Attention(nn.Module):
|
||||
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -171,7 +171,6 @@ class OlmoeAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -352,7 +352,6 @@ class OpenPanguMLAAttention(nn.Module):
|
||||
}
|
||||
self.rotary_emb = get_rope(
|
||||
qk_rope_head_dim,
|
||||
rotary_dim=qk_rope_head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=False,
|
||||
@ -525,7 +524,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=is_neox_style,
|
||||
|
||||
@ -135,7 +135,6 @@ class OrionAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -166,7 +166,6 @@ class OuroAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=config.rope_parameters,
|
||||
dual_chunk_attention_config=dual_chunk_attention_config,
|
||||
|
||||
@ -134,7 +134,6 @@ class PersimmonAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -84,19 +84,18 @@ class PhiAttention(nn.Module):
|
||||
prefix: str = "",
|
||||
):
|
||||
super().__init__()
|
||||
self.total_num_heads = config.num_attention_heads
|
||||
self.hidden_size = config.hidden_size
|
||||
self.head_size = self.hidden_size // self.total_num_heads
|
||||
self.head_size = self.hidden_size // config.num_attention_heads
|
||||
|
||||
tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
|
||||
assert self.total_num_heads % tensor_model_parallel_world_size == 0
|
||||
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
|
||||
assert config.num_attention_heads % tensor_model_parallel_world_size == 0
|
||||
self.num_heads = config.num_attention_heads // tensor_model_parallel_world_size
|
||||
|
||||
# pylint: disable=C0103
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
self.hidden_size,
|
||||
self.head_size,
|
||||
self.total_num_heads,
|
||||
config.num_attention_heads,
|
||||
bias=True,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.qkv_proj",
|
||||
@ -109,13 +108,10 @@ class PhiAttention(nn.Module):
|
||||
)
|
||||
|
||||
scaling = self.head_size**-0.5
|
||||
rotary_dim = config.hidden_size // config.num_attention_heads
|
||||
assert rotary_dim % 2 == 0
|
||||
|
||||
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_size,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -352,7 +352,6 @@ class PhiMoEAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -574,7 +574,6 @@ class Plamo2AttentionMixer(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -179,7 +179,6 @@ class Plamo3AttentionMixer(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -114,7 +114,6 @@ class QWenAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -164,7 +164,6 @@ class Qwen2Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
dual_chunk_attention_config=dual_chunk_attention_config,
|
||||
|
||||
@ -624,9 +624,9 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
head_dim = self.hidden_size // self.num_heads
|
||||
self.rotary_pos_emb = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim // 2,
|
||||
max_position=8192,
|
||||
is_neox_style=True,
|
||||
rope_parameters={"partial_rotary_factor": 0.5},
|
||||
)
|
||||
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
|
||||
@ -244,7 +244,6 @@ class Qwen2MoeAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
dual_chunk_attention_config=dual_chunk_attention_config,
|
||||
|
||||
@ -621,9 +621,9 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
head_dim = embed_dim // num_heads
|
||||
self.rotary_pos_emb = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim // 2,
|
||||
max_position=8192,
|
||||
is_neox_style=True,
|
||||
rope_parameters={"partial_rotary_factor": 0.5},
|
||||
)
|
||||
|
||||
self.blocks = nn.ModuleList(
|
||||
|
||||
@ -111,7 +111,6 @@ class Qwen3Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
dual_chunk_attention_config=dual_chunk_attention_config,
|
||||
|
||||
@ -269,7 +269,6 @@ class Qwen3MoeAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
dual_chunk_attention_config=dual_chunk_attention_config,
|
||||
|
||||
@ -747,7 +747,6 @@ class Qwen3NextAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
head_size=self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=config.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
dual_chunk_attention_config=self.dual_chunk_attention_config,
|
||||
|
||||
@ -333,9 +333,9 @@ class Qwen3Omni_VisionTransformer(nn.Module):
|
||||
head_dim = self.hidden_size // self.num_heads
|
||||
self.rotary_pos_emb = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim // 2,
|
||||
max_position=8192,
|
||||
is_neox_style=True,
|
||||
rope_parameters={"partial_rotary_factor": 0.5},
|
||||
)
|
||||
|
||||
self.blocks = nn.ModuleList(
|
||||
|
||||
@ -340,9 +340,9 @@ class Qwen3_VisionTransformer(nn.Module):
|
||||
head_dim = self.hidden_size // self.num_heads
|
||||
self.rotary_pos_emb = get_rope(
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim // 2,
|
||||
max_position=8192,
|
||||
is_neox_style=True,
|
||||
rope_parameters={"partial_rotary_factor": 0.5},
|
||||
)
|
||||
|
||||
self.merger = Qwen3_VisionPatchMerger(
|
||||
|
||||
@ -161,7 +161,6 @@ class SeedOssAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -160,7 +160,6 @@ class SolarAttention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -148,7 +148,6 @@ class StablelmAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.config.max_position_embeddings,
|
||||
rope_parameters=self.config.rope_parameters,
|
||||
)
|
||||
|
||||
@ -112,7 +112,6 @@ class Starcoder2Attention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=self.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@ -196,7 +196,6 @@ class Step3TextAttention(nn.Module):
|
||||
)
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embedding,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -230,7 +230,6 @@ class Zamba2Attention(nn.Module):
|
||||
if config.use_mem_rope:
|
||||
self.rotary_emb = get_rope(
|
||||
head_size=self.attention_head_dim,
|
||||
rotary_dim=self.attention_head_dim,
|
||||
max_position=config.max_position_embeddings,
|
||||
rope_parameters=config.rope_parameters,
|
||||
is_neox_style=True,
|
||||
|
||||
@@ -306,8 +306,13 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
     """Provide backwards compatibility for RoPE."""
     from vllm.config.utils import getattr_iter

-    rope_theta_names = ("rope_theta", "rotary_emb_base")
-    rope_theta = getattr_iter(config, rope_theta_names, None)
+    # Older custom models may use non-standard field names
+    # which need patching for both Transformers v4 and v5.
+    names = ["rope_theta", "rotary_emb_base"]
+    rope_theta = getattr_iter(config, names, None, warn=True)
+    names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
+    partial_rotary_factor = getattr_iter(config, names, None, warn=True)

     if Version(version("transformers")) < Version("5.0.0.dev0"):
         # Transformers v4 installed, legacy config fields may be present
         if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
@@ -316,14 +321,18 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
             if not hasattr(config, "rope_parameters"):
                 config.rope_parameters = {"rope_type": "default"}
             config.rope_parameters["rope_theta"] = rope_theta
-        partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct")
-        partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None)
         if partial_rotary_factor is not None:
             if not hasattr(config, "rope_parameters"):
                 config.rope_parameters = {"rope_type": "default"}
             config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
     elif rope_theta is not None or hasattr(config, "rope_parameters"):
         # Transformers v5 installed
+        # Patch these fields in case they used non-standard names
+        if rope_theta is not None:
+            config.rope_theta = rope_theta
+        if partial_rotary_factor is not None:
+            config.partial_rotary_factor = partial_rotary_factor
         # Standardize and validate RoPE parameters
         config.standardize_rope_params()
         config.validate_rope()
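To make the backwards-compatibility path concrete, here is a sketch of what the Transformers v4 branch above boils down to for an old GPT-NeoX-style config that only knows rotary_emb_base and rotary_pct; the SimpleNamespace is a stand-in, not the real PretrainedConfig, and the values are illustrative:

from types import SimpleNamespace

# Legacy (Transformers v4 era) config that predates rope_parameters.
legacy = SimpleNamespace(rotary_emb_base=10000, rotary_pct=0.25)

# What the v4 branch of patch_rope_parameters effectively produces:
rope_theta = getattr(legacy, "rope_theta", None) or legacy.rotary_emb_base
partial_rotary_factor = getattr(legacy, "partial_rotary_factor", None) or legacy.rotary_pct

legacy.rope_parameters = {"rope_type": "default"}
legacy.rope_parameters["rope_theta"] = rope_theta
legacy.rope_parameters["partial_rotary_factor"] = partial_rotary_factor

assert legacy.rope_parameters == {
    "rope_type": "default",
    "rope_theta": 10000,
    "partial_rotary_factor": 0.25,
}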