Update rope_scaling to rope_parameters in preparation for Transformers v5 (#28542)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor 2025-11-19 18:06:36 +01:00 committed by GitHub
parent d44e9df7d4
commit a8b70304d6
104 changed files with 542 additions and 910 deletions

View File

@ -872,12 +872,12 @@ steps:
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py
# - pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

View File

@ -6,7 +6,7 @@
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
@ -86,9 +86,8 @@ def benchmark_mrope(
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] = None,
rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
@ -102,9 +101,8 @@ def benchmark_mrope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dtype=dtype,
).to(device=device)
@ -203,9 +201,8 @@ def benchmark_mrope(
num_kv_heads,
head_dim,
max_position,
rope_theta,
is_neox_style,
str(rope_scaling),
str(rope_parameters),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
@ -255,9 +252,8 @@ if __name__ == "__main__":
"num_kv_heads",
"head_dim",
"max_position",
"rope_theta",
"is_neox_style",
"rope_scaling",
"rope_parameters",
"dtype",
"torch_mean",
"torch_median",
@ -303,7 +299,7 @@ if __name__ == "__main__":
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
rope_theta = config.rope_theta
rope_parameters = config.rope_parameters
max_position = config.max_position_embeddings
for num_tokens in num_tokens_list:
@ -315,9 +311,8 @@ if __name__ == "__main__":
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=rope_parameters,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,

View File

@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This script demonstrates how to extend the context length
of a Qwen model using the YARN method (rope_scaling)
of a Qwen model using the YARN method (rope_parameters)
and run a simple chat example.
Usage:
@ -19,8 +19,8 @@ def create_llm():
# Use yarn to extend context
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,

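For reference, a minimal sketch of the new override shape when constructing an engine with the YARN settings above (the model name and numbers are hypothetical; the script derives the real values from the checkpoint config):

from vllm import LLM

hf_overrides = {
    "rope_parameters": {
        "rope_theta": 1000000.0,                    # hypothetical base frequency
        "rope_type": "yarn",
        "factor": 4.0,                              # hypothetical scaling factor
        "original_max_position_embeddings": 32768,  # hypothetical
    },
}
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", hf_overrides=hf_overrides)

The key change is that rope_theta now lives inside the rope_parameters dict rather than as a sibling override key.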
View File

@ -137,7 +137,7 @@ class TestRotaryEmbedding(torch.nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position,
base=base,
rope_parameters={"rope_type": "default", "rope_theta": base},
)
def forward(self, positions, q, k):
@ -172,7 +172,7 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=base,
rope_parameters={"rope_type": "default", "rope_theta": base},
)
def forward(self, positions, hidden_states):

View File

@ -5,11 +5,11 @@ from typing import NamedTuple
import pytest
import torch
from packaging.version import Version
from transformers import AutoConfig
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@ -98,8 +98,7 @@ def test_mrope(
atol = model_info.atol
rtol = model_info.rtol
config = AutoConfig.from_pretrained(model_name)
config = config.get_text_config()
config = get_config(model_name, False).get_text_config()
# get the model config
total_num_kv_heads = config.num_key_value_heads
@ -113,7 +112,6 @@ def test_mrope(
)
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
@ -122,9 +120,8 @@ def test_mrope(
head_size=head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
dtype=dtype,
).to(device=device)
@ -173,8 +170,7 @@ def test_mrope_torch_compile_tracing(
atol = model_info.atol
rtol = model_info.rtol
config = AutoConfig.from_pretrained(model_name)
config = config.get_text_config()
config = get_config(model_name, False).get_text_config()
# get the model config
total_num_kv_heads = config.num_key_value_heads
@ -187,7 +183,6 @@ def test_mrope_torch_compile_tracing(
else config.hidden_size // total_num_heads
)
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
rotary_dim = int(head_dim * partial_rotary_factor)
@ -196,9 +191,8 @@ def test_mrope_torch_compile_tracing(
head_size=head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
dtype=dtype,
).to(device=device)

View File

@ -74,7 +74,7 @@ def test_rotary_embedding(
device: str,
use_key: bool,
max_position: int = 8192,
base: float = 10000,
rope_theta: float = 10000,
) -> None:
if rotary_dim is None:
rotary_dim = head_size
@ -83,7 +83,8 @@ def test_rotary_embedding(
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=torch.get_default_device())
positions = torch.randint(0, max_position, (batch_size, seq_len))
@ -120,9 +121,9 @@ def test_rotary_embedding(
@torch.inference_mode()
def test_rope_module_cache():
MAX_POSITIONS = [123, 1234]
BASES = [10000, 1000000]
ROPE_SCALINGS = (
None,
ROPE_THETAS = [10000, 1000000]
ROPE_PARAMETERS = (
{"rope_type": "default"},
{"rope_type": "linear", "factor": (1,)},
{"rope_type": "dynamic", "factor": 1},
)
@ -130,9 +131,9 @@ def test_rope_module_cache():
HEAD_SIZES,
ROTARY_DIMS,
MAX_POSITIONS,
BASES,
ROPE_THETAS,
IS_NEOX_STYLE,
ROPE_SCALINGS,
ROPE_PARAMETERS,
DTYPES,
)
rope_setting_id_map: dict[str, int] = {}
@ -141,20 +142,20 @@ def test_rope_module_cache():
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
rope_theta,
is_neox_style,
rope_parameters,
dtype,
) = setting
if rotary_dim is None:
rotary_dim = head_size
rope_parameters["rope_theta"] = rope_theta
rope = get_rope(
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
is_neox_style,
rope_parameters,
dtype,
)
# different settings cannot share the same rope module
@ -168,20 +169,20 @@ def test_rope_module_cache():
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
rope_theta,
is_neox_style,
rope_parameters,
dtype,
) = setting
if rotary_dim is None:
rotary_dim = head_size
rope_parameters["rope_theta"] = rope_theta
rope = get_rope(
head_size,
rotary_dim,
max_position,
base,
is_neox_stype,
rope_scaling,
is_neox_style,
rope_parameters,
dtype,
)
# check if cache take effect

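A minimal sketch of the cache behaviour this test exercises, assuming the two calls pass rope_parameters dicts with equal contents and the same key order:

from vllm.model_executor.layers.rotary_embedding import get_rope

rope_parameters = {"rope_type": "default", "rope_theta": 10000}
rope_a = get_rope(64, 64, 2048, True, rope_parameters)
rope_b = get_rope(64, 64, 2048, True, dict(rope_parameters))
assert rope_a is rope_b   # identical settings hit the module cache

other = get_rope(64, 64, 2048, True, {"rope_type": "default", "rope_theta": 500000})
assert other is not rope_a  # changing rope_theta yields a new module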
View File

@ -201,7 +201,7 @@ class ModelConfig:
sliding_window: int = 128
initial_context_length: int = 4096
rope_theta: float = 150000.0
rope_scaling_factor: float = 32.0
rope_parameters_factor: float = 32.0
rope_ntk_alpha: float = 1.0
rope_ntk_beta: float = 32.0

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: SIM117
from typing import Any
import pytest
from ...utils import EmbedModelInfo
@ -79,8 +81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
@pytest.mark.parametrize("model_info", MODELS)
def test_use_rope_scaling_legal(model_info, vllm_runner):
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
@pytest.mark.parametrize("model_info", MODELS)
def test_use_rope_scaling_illegal(model_info, vllm_runner):
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
hf_overrides: dict[str, Any] = {
"rope_parameters": {
"rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,
@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
pass
hf_overrides = {
"rope_theta": rope_theta,
"rope_scaling": {
"rope_parameters": {
"rope_theta": rope_theta,
"rope_type": "yarn",
"factor": factor,
"original_max_position_embeddings": original_max_position_embeddings,

View File

@ -249,45 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config():
def test_rope_customization():
TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}
TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
TEST_ROPE_PARAMETERS = {
"rope_theta": 16_000_000.0,
"rope_type": "dynamic",
"factor": 2.0,
}
LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"}
LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0}
llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
assert (
getattr(llama_model_config.hf_config, "rope_parameters", None)
== LLAMA_ROPE_PARAMETERS
)
assert llama_model_config.max_model_len == 8192
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
hf_overrides={
"rope_scaling": TEST_ROPE_SCALING,
"rope_theta": TEST_ROPE_THETA,
},
hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS},
)
assert (
getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING
getattr(llama_model_config.hf_config, "rope_parameters", None)
== TEST_ROPE_PARAMETERS
)
assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384
longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
# Check if LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config
assert all(
longchat_model_config.hf_config.rope_scaling.get(key) == value
for key, value in LONGCHAT_ROPE_SCALING.items()
longchat_model_config.hf_config.rope_parameters.get(key) == value
for key, value in LONGCHAT_ROPE_PARAMETERS.items()
)
assert longchat_model_config.max_model_len == 16384
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
hf_overrides={
"rope_scaling": TEST_ROPE_SCALING,
"rope_parameters": TEST_ROPE_PARAMETERS,
},
)
assert (
getattr(longchat_model_config.hf_config, "rope_scaling", None)
== TEST_ROPE_SCALING
getattr(longchat_model_config.hf_config, "rope_parameters", None)
== TEST_ROPE_PARAMETERS
)
assert longchat_model_config.max_model_len == 4096

View File

@ -11,6 +11,7 @@ import torch
from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
from pydantic.dataclasses import dataclass
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
import vllm.envs as envs
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
@ -2100,31 +2101,32 @@ def _get_and_verify_max_len(
)
derived_max_model_len = default_max_len
rope_scaling = getattr(hf_config, "rope_scaling", None)
# In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict].
# To simplify the verification, we convert it to dict[str, TypedDict].
rope_parameters = getattr(hf_config, "rope_parameters", None)
if rope_parameters and not set(rope_parameters.keys()).issubset(
ALLOWED_LAYER_TYPES
):
rope_parameters = {"": rope_parameters}
# NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE
# scaling, so we skip applying the scaling factor again.
if rope_scaling is not None and "gemma3" not in hf_config.model_type:
# No need to consider "type" key because of patch_rope_scaling when
# loading HF config
rope_type = rope_scaling["rope_type"]
if rope_parameters is not None and "gemma3" not in hf_config.model_type:
scaling_factor = 1.0
for rp in rope_parameters.values():
# No need to consider "type" key because of patch_rope_parameters when
# loading HF config
rope_type = rp["rope_type"]
if rope_type not in ("su", "longrope", "llama3"):
if disable_sliding_window:
# TODO(robertgshaw): Find a model that supports rope_scaling
# with sliding window to see if this case should be allowed.
raise NotImplementedError(
"Disabling sliding window is not supported for models "
"with rope_scaling. Please raise an issue so we can "
"investigate."
)
if rope_type not in ("su", "longrope", "llama3"):
# NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
# NOTE: This assumes all layer types have the same scaling factor.
scaling_factor = rp.get("factor", scaling_factor)
# NOTE: rope_type == "default" does not define factor
# https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py
scaling_factor = rope_scaling.get("factor", 1.0)
if rope_type == "yarn":
derived_max_model_len = rope_scaling["original_max_position_embeddings"]
derived_max_model_len *= scaling_factor
if rope_type == "yarn":
derived_max_model_len = rp["original_max_position_embeddings"]
# Do this outside loop since all layer types should have the same scaling
derived_max_model_len *= scaling_factor
if encoder_config and "max_seq_length" in encoder_config:
derived_max_model_len = encoder_config["max_seq_length"]
@ -2134,7 +2136,9 @@ def _get_and_verify_max_len(
if max_model_len is None:
# For LongRoPE, default to original_max_position_embeddings to avoid
# performance degradation for shorter sequences
if rope_scaling is not None and rope_scaling["rope_type"] == "longrope":
if rope_parameters is not None and any(
rp["rope_type"] == "longrope" for rp in rope_parameters.values()
):
max_model_len = int(
getattr(
hf_config, "original_max_position_embeddings", derived_max_model_len
@ -2151,16 +2155,7 @@ def _get_and_verify_max_len(
# that will be bigger than derived_max_model_len. We compare user input
# with model_max_length and allow this override when it's smaller.
model_max_length = getattr(hf_config, "model_max_length", None)
if model_max_length is not None and max_model_len <= model_max_length:
if disable_sliding_window:
# TODO(robertgshaw): Find a model that has model_max_length
# with sliding window to see if this case should be allowed.
raise NotImplementedError(
"Disabling sliding window is not supported for models "
"model_max_length in the config. Please raise an issue "
"so we can investigate."
)
else:
if model_max_length is None or max_model_len > model_max_length:
msg = (
f"User-specified max_model_len ({max_model_len}) is greater "
f"than the derived max_model_len ({max_len_key}="

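To make the per-layer-type handling concrete, a small worked sketch under the same assumptions as the code above (a flat pre-v5 dict with hypothetical yarn values; ALLOWED_LAYER_TYPES holds Transformers layer-type names such as "full_attention" and "sliding_attention"):

from transformers.configuration_utils import ALLOWED_LAYER_TYPES

# Flat (pre-v5) rope config: its keys are rope fields, not layer-type names,
# so it gets wrapped under a single pseudo layer type "".
rope_parameters = {
    "rope_type": "yarn",
    "factor": 4.0,                              # hypothetical
    "original_max_position_embeddings": 8192,   # hypothetical
}
if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
    rope_parameters = {"": rope_parameters}

derived_max_model_len = 8192
scaling_factor = 1.0
for rp in rope_parameters.values():
    if rp["rope_type"] not in ("su", "longrope", "llama3"):
        scaling_factor = rp.get("factor", scaling_factor)
    if rp["rope_type"] == "yarn":
        derived_max_model_len = rp["original_max_position_embeddings"]
derived_max_model_len *= scaling_factor  # 8192 * 4.0 -> 32768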
View File

@ -26,23 +26,23 @@ def get_rope(
head_size: int,
rotary_dim: int,
max_position: int,
base: float,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype | None = None,
partial_rotary_factor: float = 1.0,
dual_chunk_attention_config: dict[str, Any] | None = None,
) -> RotaryEmbedding:
if dtype is None:
dtype = torch.get_default_dtype()
if rope_scaling is not None:
if rope_parameters is not None:
# Transforms every value that is a list into a tuple for caching calls
rope_scaling_tuple = {
k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
rope_parameters_tuple = {
k: tuple(v) if isinstance(v, list) else v
for k, v in rope_parameters.items()
}
rope_scaling_args = tuple(rope_scaling_tuple.items())
rope_parameters_args = tuple(rope_parameters_tuple.items())
else:
rope_scaling_args = None
rope_parameters_args = None
if dual_chunk_attention_config is not None:
dual_chunk_attention_tuple = {
@ -60,15 +60,15 @@ def get_rope(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
rope_scaling_args,
rope_parameters_args,
dual_chunk_attention_args,
dtype,
)
if key in _ROPE_DICT:
return _ROPE_DICT[key]
base = rope_parameters["rope_theta"] if rope_parameters else 10000
if dual_chunk_attention_config is not None:
extra_kwargs = {
k: v
@ -84,18 +84,18 @@ def get_rope(
dtype,
**extra_kwargs,
)
elif not rope_scaling:
elif not rope_parameters:
rotary_emb = RotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
else:
scaling_type = rope_scaling["rope_type"]
scaling_type = rope_parameters["rope_type"]
if scaling_type == "llama3":
scaling_factor = rope_scaling["factor"]
low_freq_factor = rope_scaling["low_freq_factor"]
high_freq_factor = rope_scaling["high_freq_factor"]
original_max_position = rope_scaling["original_max_position_embeddings"]
scaling_factor = rope_parameters["factor"]
low_freq_factor = rope_parameters["low_freq_factor"]
high_freq_factor = rope_parameters["high_freq_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
rotary_emb = Llama3RotaryEmbedding(
head_size,
rotary_dim,
@ -113,7 +113,7 @@ def get_rope(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
elif scaling_type == "default":
if "mrope_section" in rope_scaling:
if "mrope_section" in rope_parameters:
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
@ -121,8 +121,8 @@ def get_rope(
base,
is_neox_style,
dtype,
mrope_section=rope_scaling["mrope_section"],
mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
)
else:
rotary_emb = RotaryEmbedding(
@ -134,7 +134,7 @@ def get_rope(
dtype,
)
elif scaling_type == "linear":
scaling_factor = rope_scaling["factor"]
scaling_factor = rope_parameters["factor"]
rotary_emb = LinearScalingRotaryEmbedding(
head_size,
rotary_dim,
@ -145,8 +145,8 @@ def get_rope(
dtype,
)
elif scaling_type == "ntk":
scaling_factor = rope_scaling["factor"]
mixed_b = rope_scaling.get("mixed_b", None)
scaling_factor = rope_parameters["factor"]
mixed_b = rope_parameters.get("mixed_b")
rotary_emb = NTKScalingRotaryEmbedding(
head_size,
rotary_dim,
@ -158,8 +158,8 @@ def get_rope(
mixed_b,
)
elif scaling_type == "dynamic":
if "alpha" in rope_scaling:
scaling_alpha = rope_scaling["alpha"]
if "alpha" in rope_parameters:
scaling_alpha = rope_parameters["alpha"]
rotary_emb = DynamicNTKAlphaRotaryEmbedding(
head_size,
rotary_dim,
@ -169,8 +169,8 @@ def get_rope(
scaling_alpha,
dtype,
)
elif "factor" in rope_scaling:
scaling_factor = rope_scaling["factor"]
elif "factor" in rope_parameters:
scaling_factor = rope_parameters["factor"]
rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size,
rotary_dim,
@ -185,11 +185,11 @@ def get_rope(
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
)
elif scaling_type == "yarn":
scaling_factor = rope_scaling["factor"]
original_max_position = rope_scaling["original_max_position_embeddings"]
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_scaling.items()
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
@ -199,7 +199,7 @@ def get_rope(
"apply_yarn_scaling",
)
}
if "mrope_section" in rope_scaling:
if "mrope_section" in rope_parameters:
extra_kwargs.pop("apply_yarn_scaling", None)
rotary_emb = MRotaryEmbedding(
head_size,
@ -208,8 +208,8 @@ def get_rope(
base,
is_neox_style,
dtype,
mrope_section=rope_scaling["mrope_section"],
mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
scaling_factor=scaling_factor,
**extra_kwargs,
)
@ -225,12 +225,12 @@ def get_rope(
**extra_kwargs,
)
elif scaling_type == "deepseek_yarn":
scaling_factor = rope_scaling["factor"]
original_max_position = rope_scaling["original_max_position_embeddings"]
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
# assert max_position == original_max_position * scaling_factor
extra_kwargs = {
k: v
for k, v in rope_scaling.items()
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
@ -252,12 +252,12 @@ def get_rope(
**extra_kwargs,
)
elif scaling_type == "longrope":
short_factor = rope_scaling["short_factor"]
long_factor = rope_scaling["long_factor"]
original_max_position = rope_scaling["original_max_position_embeddings"]
short_factor = rope_parameters["short_factor"]
long_factor = rope_parameters["long_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_scaling.items()
for k, v in rope_parameters.items()
if k in ("short_mscale", "long_mscale")
}
rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(

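With this change the base frequency travels inside rope_parameters as "rope_theta" instead of the removed base argument, and the separate rope_scaling argument is gone. A minimal sketch of the new call shape (all values hypothetical):

from vllm.model_executor.layers.rotary_embedding import get_rope

rope_parameters = {
    "rope_theta": 10000.0,                      # was the separate `base` argument
    "rope_type": "yarn",
    "factor": 4.0,                              # hypothetical
    "original_max_position_embeddings": 4096,   # hypothetical
}
rotary_emb = get_rope(
    head_size=128,
    rotary_dim=128,
    max_position=16384,                         # 4096 * 4
    is_neox_style=True,
    rope_parameters=rope_parameters,
)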
View File

@ -5,7 +5,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -171,8 +170,6 @@ class AfmoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
head_dim: int | None = None,
rms_norm_eps: float = 1e-05,
@ -202,7 +199,6 @@ class AfmoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
# Check if this is a local attention layer
@ -246,8 +242,7 @@ class AfmoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config["rope_parameters"],
is_neox_style=True,
)
else:
@ -303,14 +298,6 @@ class AfmoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
# DecoderLayers are created with `make_layers` which passes the prefix
@ -323,8 +310,6 @@ class AfmoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
head_dim=config.head_dim,
rms_norm_eps=config.rms_norm_eps,

View File

@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -118,8 +117,6 @@ class ApertusAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -155,7 +152,6 @@ class ApertusAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -176,9 +172,7 @@ class ApertusAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
self._init_rotary_emb(
config, rope_scaling=rope_scaling, quant_config=quant_config
)
self._init_rotary_emb(config, quant_config=quant_config)
sliding_window = None
if layer_types := getattr(config, "layer_types", None):
@ -224,7 +218,6 @@ class ApertusAttention(nn.Module):
def _init_rotary_emb(
self,
config: ApertusConfig,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
@ -236,8 +229,7 @@ class ApertusAttention(nn.Module):
self.head_dim,
rotary_dim=int(self.partial_rotary_factor * self.head_dim),
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
@ -253,14 +245,6 @@ class ApertusDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -288,8 +272,6 @@ class ApertusDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -103,15 +103,6 @@ class ArceeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Rotary embedding parameters (reuse LLaMA defaults)
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Determine if attention bias is needed (some variants use bias terms)
attention_bias = getattr(config, "attention_bias", False) or getattr(
@ -133,8 +124,6 @@ class ArceeDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -292,7 +292,6 @@ class ArcticAttention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.scaling = self.head_dim**-0.5
self.qkv_proj = QKVParallelLinear(
@ -317,7 +316,7 @@ class ArcticAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=int(self.rope_theta),
rope_parameters=config.rope_parameters,
is_neox_style=True,
)

View File

@ -136,7 +136,7 @@ class BaiChuanAttention(nn.Module):
hidden_size: int,
num_heads: int,
position_embedding: str,
rope_theta: float = 10000,
rope_parameters: dict,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -150,7 +150,6 @@ class BaiChuanAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.head_dim = hidden_size // self.total_num_heads
self.position_embedding = position_embedding
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
# pylint: disable=invalid-name
@ -192,7 +191,7 @@ class BaiChuanAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_parameters=rope_parameters,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(
@ -229,13 +228,12 @@ class BaiChuanDecoderLayer(nn.Module):
):
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = BaiChuanAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
position_embedding=position_embedding,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -135,9 +135,8 @@ class BailingAttention(nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=True,
rope_scaling=config.rope_scaling,
partial_rotary_factor=self.partial_rotary_factor,
)

View File

@ -156,8 +156,6 @@ class BambaAttentionDecoderLayer(nn.Module):
prefix: str = "",
) -> None:
super().__init__()
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.hidden_size = config.hidden_size
tp_size = get_tensor_model_parallel_world_size()
@ -178,7 +176,6 @@ class BambaAttentionDecoderLayer(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
@ -192,8 +189,7 @@ class BambaAttentionDecoderLayer(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
rope_scaling=rope_scaling,
base=rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=True,
dtype=torch.get_default_dtype(), # see impl of get_rope
)

View File

@ -265,8 +265,7 @@ class ChameleonAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 4096,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -293,7 +292,6 @@ class ChameleonAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -318,8 +316,7 @@ class ChameleonAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
@ -369,14 +366,6 @@ class ChameleonDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
self.self_attn = ChameleonAttention(
@ -385,8 +374,7 @@ class ChameleonDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,
@ -439,14 +427,6 @@ class ChameleonSwinDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
self.self_attn = ChameleonAttention(
@ -455,8 +435,7 @@ class ChameleonSwinDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,

View File

@ -99,6 +99,7 @@ class GLMAttention(nn.Module):
# https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
rope_ratio = getattr(config, "rope_ratio", 1.0)
max_positions = getattr(config, "seq_length", 8192)
rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio}
# NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False,
# which is equivalent to is_neox_style=True
is_neox_style = not config.original_rope
@ -106,7 +107,7 @@ class GLMAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim // 2,
max_position=max_positions,
base=10000 * rope_ratio,
rope_parameters=rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(

View File

@ -156,8 +156,6 @@ class CohereAttention(nn.Module):
self.max_position_embeddings = getattr(
config, "model_max_length", None
) or getattr(config, "max_position_embeddings", 8192)
self.rope_theta = config.rope_theta
self.rope_scaling = getattr(config, "rope_scaling", None)
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.qkv_proj = QKVParallelLinear(
self.hidden_size,
@ -179,8 +177,7 @@ class CohereAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)

View File

@ -8,6 +8,7 @@ import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform
from vllm.transformers_utils.config import set_default_rope_theta
from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@ -46,8 +47,7 @@ class GteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings,
"base": config.rope_theta,
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
@ -78,12 +78,13 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
if not model_config.enforce_eager:
max_position = round_up(max_position, 8)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": max_position,
"base": getattr(config, "rope_theta", config.rotary_emb_base),
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
@ -117,18 +118,20 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
head_dim = config.hidden_size // config.num_attention_heads
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048)
set_default_rope_theta(config, default_theta=config.rotary_emb_base)
config.rotary_kwargs = {
"head_size": head_dim,
"rotary_dim": rotary_emb_dim,
"max_position": max_trained_positions,
"base": getattr(config, "rope_theta", config.rotary_emb_base),
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}
# we ignore config.rotary_scaling_factor so that for datasets shorter
# than max_trained_positions 2048, the results are consistent
# with SentenceTransformer.
# The context extension uses vllm style rope_theta and rope_scaling.
# The context extension uses vllm style rope_theta and rope_parameters.
# See #17785 #18755
if (
not vllm_config.model_config.hf_overrides
@ -172,7 +175,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
if hasattr(hf_text_config, "max_model_len"):
delattr(hf_text_config, "max_model_len")
hf_text_config.max_position_embeddings = max_trained_positions
hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"]
hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"]
# The priority of sentence_bert_config.json is higher
# than max_position_embeddings
@ -246,8 +249,7 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):
"head_size": head_dim,
"rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
"max_position": config.max_position_embeddings,
"base": config.rope_theta,
"rope_scaling": getattr(config, "rope_scaling", None),
"rope_parameters": config.rope_parameters,
}

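The helper set_default_rope_theta is only imported and called in this diff, so its body is not shown here; a rough sketch of the behaviour these call sites imply (an assumption, not the actual vLLM implementation):

def set_default_rope_theta(config, default_theta: float) -> None:
    # Assumed behaviour: guarantee config.rope_parameters exists and carries a
    # rope_theta, falling back to the model-specific default given by the caller.
    rope_parameters = getattr(config, "rope_parameters", None) or {"rope_type": "default"}
    rope_parameters.setdefault("rope_theta", getattr(config, "rope_theta", default_theta))
    config.rope_parameters = rope_parameters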
View File

@ -197,7 +197,10 @@ class DbrxAttention(nn.Module):
self.head_dim = self.d_model // self.total_num_heads
self.total_num_kv_heads = config.attn_config.kv_n_heads
self.clip_qkv = config.attn_config.clip_qkv
self.rope_theta = config.attn_config.rope_theta
rope_parameters = {
"rope_type": "default",
"rope_theta": int(config.attn_config.rope_theta),
}
self.max_position = config.max_seq_len
# pylint: disable=invalid-name
@ -221,7 +224,7 @@ class DbrxAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
)

View File

@ -27,7 +27,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module):
config: DeepseekV2Config | DeepseekV3Config,
hidden_size: int,
num_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module):
v_head_dim: int,
q_lora_rank: int,
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module):
assert num_heads % tp_size == 0
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
assert topk_indices_buffer is None, (
"topk_indices_buffer is not \
@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.o_proj",
)
if rope_scaling:
rope_scaling["rope_type"] = "deepseek_yarn"
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module):
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if self.q_lora_rank is not None:
@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
if rope_scaling:
rope_scaling["rope_type"] = "deepseek_yarn"
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
moe_layer_freq = getattr(config, "moe_layer_freq", 1)
# DecoderLayers are created with `make_layers` which passes the prefix
@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module):
v_head_dim=v_head_dim,
q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
kv_lora_rank=kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

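The DeepSeek attention path rescales its softmax scale whenever non-default (yarn-style) rope parameters are present. A worked sketch of that rescaling, assuming yarn_get_mscale follows the published YaRN formula (0.1 * mscale_all_dim * ln(factor) + 1 for factor > 1) and using hypothetical numbers:

import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # Assumed to match the YaRN formula used by the vLLM helper of the same name.
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim = 192                               # hypothetical
scaling = qk_head_dim ** -0.5
rope_parameters = {"rope_type": "deepseek_yarn", "factor": 40.0, "mscale_all_dim": 1.0}
if rope_parameters["rope_type"] != "default":
    mscale = yarn_get_mscale(
        rope_parameters["factor"],
        float(rope_parameters.get("mscale_all_dim", False)),
    )
    scaling = scaling * mscale * mscale         # ~1.369**2, i.e. about 1.87x larger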
View File

@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -202,8 +201,6 @@ class Dots1Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
config: Dots1Config,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -229,7 +226,6 @@ class Dots1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
attention_bias = config.attention_bias
@ -255,8 +251,7 @@ class Dots1Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -296,8 +291,6 @@ class Dots1DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1])
self.layer_idx = layer_idx
@ -307,8 +300,6 @@ class Dots1DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
config=config,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -62,6 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
from .utils import (
@ -232,9 +233,8 @@ class Ernie4_5_MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict[str, Any],
head_dim: int | None = None,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05,
qkv_bias: bool = False,
@ -266,7 +266,6 @@ class Ernie4_5_MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -291,9 +290,8 @@ class Ernie4_5_MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=rope_parameters,
is_neox_style=False,
rope_scaling=rope_scaling,
)
self.attn = Attention(
self.num_heads,
@ -333,16 +331,14 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 500000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=500000)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
self.self_attn = Ernie4_5_MoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "use_bias", False),

View File

@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .ernie45_moe import Ernie4_5_MoeMLP
from .interfaces import SupportsPP
@ -91,9 +92,8 @@ class Ernie4_5_VLMoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict[str, Any],
head_dim: int | None = None,
rope_theta: float = 500000,
rope_scaling: dict[str, Any] | None = None,
freq_allocation: int = 20,
max_position_embeddings: int = 131072,
rms_norm_eps: float = 1e-05,
@ -126,7 +126,6 @@ class Ernie4_5_VLMoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -155,7 +154,7 @@ class Ernie4_5_VLMoeAttention(nn.Module):
head_size=self.head_dim,
rotary_dim=self.head_dim,
max_position_embeddings=max_position_embeddings,
base=rope_theta,
base=rope_parameters["rope_theta"],
is_neox_style=False,
dtype=torch.get_default_dtype(),
mrope_section=[h_rope, w_rope, t_rope],
@ -413,8 +412,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 500000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=500000)
freq_allocation = getattr(config, "freq_allocation", 20)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
@ -423,8 +421,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
head_dim=getattr(config, "head_dim", None),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
freq_allocation=freq_allocation,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,

View File

@ -27,7 +27,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -113,8 +112,6 @@ class ExaoneAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -144,7 +141,6 @@ class ExaoneAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -173,8 +169,7 @@ class ExaoneAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
@ -207,8 +202,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -221,8 +214,6 @@ class ExaoneBlockAttention(nn.Module):
hidden_size=hidden_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=bias,
@ -251,14 +242,6 @@ class ExaoneDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -272,8 +255,6 @@ class ExaoneDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -23,7 +23,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -52,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
@ -110,8 +110,6 @@ class Exaone4Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 1000000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -141,7 +139,6 @@ class Exaone4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -176,12 +173,12 @@ class Exaone4Attention(nn.Module):
# apply rotary embeddings to every layer in full attention models
self.apply_rope_all_layers = "sliding_attention" not in config.layer_types
set_default_rope_theta(config, default_theta=1000000)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
self.attn = Attention(
@ -227,14 +224,6 @@ class Exaone4DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -249,8 +238,6 @@ class Exaone4DecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -164,13 +164,12 @@ class FalconAttention(nn.Module):
)
if self.use_rotary:
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,

View File

@ -35,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import (
HasInnerState,
@ -214,8 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
prefix: str = "",
) -> None:
super().__init__()
rope_theta = getattr(config, "rope_theta", 1e11)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1e11)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.hidden_size = config.hidden_size
tp_size = get_tensor_model_parallel_world_size()
@ -240,7 +240,6 @@ class FalconH1AttentionDecoderLayer(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
if hasattr(config, "partial_rotary_factor"):
@ -254,8 +253,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
rope_scaling=rope_scaling,
base=rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=True,
dtype=None, # see impl of get_rope
)

View File

@ -20,6 +20,7 @@
from collections.abc import Iterable
from functools import cache
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -127,8 +128,8 @@ class GemmaAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
head_dim: int,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@ -153,7 +154,6 @@ class GemmaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -176,7 +176,7 @@ class GemmaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=self.rope_theta,
rope_parameters=rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@ -218,7 +218,7 @@ class GemmaDecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",

View File

@ -107,7 +107,6 @@ class Gemma2Attention(nn.Module):
num_kv_heads: int,
head_dim: int,
max_position_embeddings: int,
rope_theta: float,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
attn_logits_soft_cap: float | None = None,
@ -134,7 +133,6 @@ class Gemma2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.query_pre_attn_scalar**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -156,7 +154,7 @@ class Gemma2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=self.rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
@ -206,7 +204,6 @@ class Gemma2DecoderLayer(nn.Module):
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
cache_config=cache_config,
quant_config=quant_config,
attn_logits_soft_cap=config.attn_logit_softcapping,

View File

@ -155,25 +155,28 @@ class Gemma3Attention(nn.Module):
self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
layer_idx = extract_layer_index(prefix)
self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
layer_type = config.layer_types[layer_idx]
self.is_sliding = layer_type == "sliding_attention"
sliding_window = config.sliding_window if self.is_sliding else None
# Initialize the rotary embedding.
if self.is_sliding:
# Local attention. Override the values in config.json.
self.rope_theta = config.rope_local_base_freq
self.rope_scaling = {"rope_type": "default"}
if layer_type in config.rope_parameters:
# Transformers v5 rope config.
rope_parameters = config.rope_parameters[layer_type]
else:
# Transformers v4 rope config.
# Global attention. Use the values in config.json.
self.rope_theta = config.rope_theta
self.rope_scaling = config.rope_scaling
rope_parameters = config.rope_parameters.copy()
# Local attention. Override the values in config.json.
if self.is_sliding:
rope_parameters["rope_theta"] = config.rope_local_base_freq
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=self.rope_theta,
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=self.rope_scaling,
)
if getattr(config, "is_causal", True):

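# Sketch (not part of this commit's diff): Gemma3 now supports both config
# layouts. In Transformers v5, rope_parameters is keyed by layer type; in v4
# it is a single flat dict and sliding layers override rope_theta locally.
# Key names mirror the hunk above; the sample values are made up.
def pick_rope_parameters(
    rope_parameters: dict, layer_type: str, rope_local_base_freq: float
) -> dict:
    if layer_type in rope_parameters:
        # v5 layout: one sub-dict per layer type.
        return dict(rope_parameters[layer_type])
    # v4 layout: single flat dict; local (sliding) attention overrides theta.
    params = dict(rope_parameters)
    if layer_type == "sliding_attention":
        params["rope_theta"] = rope_local_base_freq
    return params


v4 = {"rope_type": "default", "rope_theta": 1_000_000.0}
v5 = {
    "full_attention": {"rope_type": "linear", "factor": 8.0, "rope_theta": 1_000_000.0},
    "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
}
print(pick_rope_parameters(v4, "sliding_attention", rope_local_base_freq=10_000.0))
print(pick_rope_parameters(v5, "full_attention", rope_local_base_freq=10_000.0))
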
View File

@ -332,18 +332,21 @@ class Gemma3nAttention(nn.Module):
)
layer_idx = extract_layer_index(prefix)
is_sliding = config.layer_types[layer_idx] == "sliding_attention"
layer_type = config.layer_types[layer_idx]
is_sliding = layer_type == "sliding_attention"
self.sliding_window = config.sliding_window if is_sliding else None
# Initialize the rotary embedding.
if is_sliding:
# Local attention. Override the values in config.json.
rope_theta = config.rope_local_base_freq
rope_scaling = {"rope_type": "default"}
if layer_type in config.rope_parameters:
# Transformers v5 rope config.
rope_parameters = config.rope_parameters[layer_type]
else:
# Transformers v4 rope config.
# Global attention. Use the values in config.json.
rope_theta = config.rope_theta
rope_scaling = config.rope_scaling
rope_parameters = config.rope_parameters.copy()
# Local attention. Override the values in config.json.
if is_sliding:
rope_parameters["rope_theta"] = config.rope_local_base_freq
first_kv_shared_layer_idx = (
config.num_hidden_layers - config.num_kv_shared_layers
@ -383,9 +386,8 @@ class Gemma3nAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=rope_scaling,
)
self.attn = Attention(

View File

@ -57,10 +57,8 @@ class Glm4Attention(nn.Module):
max_position: int = 4096 * 32,
head_dim: int | None = None,
qkv_bias: bool = False,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
) -> None:
@ -86,7 +84,6 @@ class Glm4Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
self.head_dim,
@ -107,8 +104,7 @@ class Glm4Attention(nn.Module):
self.head_dim,
rotary_dim=self.rotary_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
is_neox_style=False,
)
@ -150,8 +146,6 @@ class Glm4DecoderLayer(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Glm4Attention(
config=config,
@ -159,12 +153,10 @@ class Glm4DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None),
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=AttentionType.DECODER,
)

View File

@ -703,7 +703,6 @@ class Glm4vVisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
self.blocks = nn.ModuleList(

View File

@ -26,7 +26,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -233,8 +232,6 @@ class Glm4MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 131072,
head_dim: int | None = None,
rms_norm_eps: float = 1e-05,
@ -264,7 +261,6 @@ class Glm4MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = use_qk_norm
@ -291,8 +287,7 @@ class Glm4MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=partial_rotary_factor,
)
self.attn = Attention(
@ -341,8 +336,6 @@ class Glm4MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
# DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index.
@ -354,8 +347,6 @@ class Glm4MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
head_dim=config.head_dim,
rms_norm_eps=config.rms_norm_eps,

View File

@ -95,13 +95,12 @@ class GPTJAttention(nn.Module):
scaling = self.head_size**-0.5
assert getattr(config, "rotary", True)
assert config.rotary_dim % 2 == 0
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=config.rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
self.attn = Attention(

View File

@ -92,13 +92,12 @@ class GPTNeoXAttention(nn.Module):
scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,

View File

@ -67,16 +67,16 @@ class OAIAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
dtype=torch.float32,
rope_scaling={
rope_parameters={
"rope_theta": config.rope_parameters["rope_theta"],
"rope_type": "yarn",
"factor": config.rope_scaling["factor"],
"original_max_position_embeddings": config.rope_scaling[
"factor": config.rope_parameters["factor"],
"original_max_position_embeddings": config.rope_parameters[
"original_max_position_embeddings"
],
"beta_fast": config.rope_scaling["beta_fast"],
"beta_slow": config.rope_scaling["beta_slow"],
"beta_fast": config.rope_parameters["beta_fast"],
"beta_slow": config.rope_parameters["beta_slow"],
},
is_neox_style=True,
)
@ -90,7 +90,6 @@ class OAIAttention(nn.Module):
self.q_size = self.num_attention_heads * self.head_dim // tp_size
self.kv_size = self.num_key_value_heads * self.head_dim // tp_size
self.scaling = self.head_dim**-0.5
self.rope_theta = config.rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size=self.hidden_size,

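# Sketch (not part of this commit's diff): the yarn-style dict assembled above
# needs every field listed below, otherwise the indexing raises KeyError at
# construction time. The checker and the sample values are illustrative only.
YARN_KEYS = (
    "rope_theta",
    "factor",
    "original_max_position_embeddings",
    "beta_fast",
    "beta_slow",
)


def check_yarn_rope_parameters(rope_parameters: dict) -> dict:
    missing = [k for k in YARN_KEYS if k not in rope_parameters]
    if missing:
        raise ValueError(f"rope_parameters missing yarn fields: {missing}")
    return {"rope_type": "yarn", **{k: rope_parameters[k] for k in YARN_KEYS}}


print(check_yarn_rope_parameters({
    "rope_theta": 150_000.0,
    "factor": 32.0,
    "original_max_position_embeddings": 4096,
    "beta_fast": 32.0,
    "beta_slow": 1.0,
}))
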
View File

@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -112,8 +111,6 @@ class GraniteAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -143,7 +140,6 @@ class GraniteAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = config.attention_multiplier
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -167,8 +163,7 @@ class GraniteAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -204,14 +199,6 @@ class GraniteDecoderLayer(nn.Module):
super().__init__()
self.hidden_size = config.hidden_size
self.residual_multiplier = config.residual_multiplier
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -225,8 +212,6 @@ class GraniteDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -141,8 +141,7 @@ class GraniteMoeAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
attention_multiplier: float | None = None,
@ -172,7 +171,6 @@ class GraniteMoeAttention(nn.Module):
if attention_multiplier is not None
else self.head_dim**-1
)
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -194,9 +192,8 @@ class GraniteMoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=rope_scaling,
)
self.attn = Attention(
self.num_heads,
@ -235,16 +232,12 @@ class GraniteMoeDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",

View File

@ -273,10 +273,7 @@ class GraniteMoeHybridAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
base=int(config.rope_theta),
rope_scaling=config.rope_scaling
if hasattr(config, "rope_scaling") and config.rope_scaling is not None
else None,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
else:

View File

@ -84,16 +84,12 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = GraniteMoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",

View File

@ -25,6 +25,7 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
import torch.nn.functional as F
@ -134,7 +135,7 @@ class Grok1Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@ -161,7 +162,6 @@ class Grok1Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -183,7 +183,7 @@ class Grok1Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
)
@ -234,15 +234,12 @@ class Grok1DecoderLayer(nn.Module):
if not self.use_fp8 and hasattr(quant_config, "is_fp8"):
self.use_fp8 = quant_config.is_fp8
# Requires transformers > 4.32.0
# Default rope_theta value if not in config
rope_theta = 10000
self.attn = Grok1Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",

View File

@ -27,7 +27,6 @@
import typing
from collections.abc import Callable, Iterable
from itertools import islice
from typing import Any
import regex as re
import torch
@ -142,8 +141,6 @@ class HunYuanAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -177,7 +174,6 @@ class HunYuanAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id
@ -204,8 +200,7 @@ class HunYuanAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@ -254,8 +249,6 @@ class HunYuanCrossAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -289,7 +282,6 @@ class HunYuanCrossAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.use_qk_norm = getattr(config, "use_qk_norm", False)
self.layer_id = layer_id
@ -314,8 +306,7 @@ class HunYuanCrossAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@ -494,14 +485,6 @@ class HunYuanDecoderLayer(nn.Module):
if isinstance(config.intermediate_size, int)
else config.intermediate_size[layer_id]
)
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
attention_bias = getattr(config, "attention_bias", False) or getattr(
config, "bias", False
@ -520,8 +503,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,
@ -537,8 +518,6 @@ class HunYuanDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -91,8 +91,7 @@ class InternLM2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -120,7 +119,6 @@ class InternLM2Attention(nn.Module):
self.kv_size = self.num_kv_heads * self.head_dim
self.key_value_groups = int(self.num_heads / self.num_kv_heads)
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.wqkv = QKVParallelLinear(
@ -144,8 +142,7 @@ class InternLM2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -204,15 +201,12 @@ class InternLMDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -30,15 +30,12 @@ class InternLM2VEDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.attention = InternLM2Attention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Any
import torch
from torch import nn
@ -190,9 +189,7 @@ class KimiMLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
rope_theta: float = 10000,
use_nope: bool = False,
rope_scaling: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@ -210,11 +207,9 @@ class KimiMLAAttention(nn.Module):
tp_size = get_tensor_model_parallel_world_size()
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.use_nope = use_nope
assert self.use_nope is True
assert self.q_lora_rank is None
assert rope_scaling is None
assert num_heads % tp_size == 0
self.kv_a_proj_with_mqa = ReplicatedLinear(
self.hidden_size,

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
import torch.nn as nn
@ -96,8 +95,6 @@ class Lfm2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -126,7 +123,6 @@ class Lfm2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -149,8 +145,7 @@ class Lfm2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@ -199,14 +194,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
self.config = config
self.layer_idx = layer_idx
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2Attention(
@ -215,8 +202,6 @@ class Lfm2AttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
import torch.nn as nn
@ -189,8 +188,6 @@ class Lfm2MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -219,7 +216,6 @@ class Lfm2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -242,8 +238,7 @@ class Lfm2MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@ -293,14 +288,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
self.config = config
self.layer_idx = layer_idx
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = Lfm2MoeAttention(
@ -309,8 +296,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
hidden_size=config.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -120,8 +119,6 @@ class LlamaAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -157,7 +154,6 @@ class LlamaAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
@ -186,9 +182,7 @@ class LlamaAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
self._init_rotary_emb(
config, rope_scaling=rope_scaling, quant_config=quant_config
)
self._init_rotary_emb(config, quant_config=quant_config)
sliding_window = None
if layer_types := getattr(config, "layer_types", None):
@ -258,7 +252,6 @@ class LlamaAttention(nn.Module):
def _init_rotary_emb(
self,
config: LlamaConfig,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
@ -270,8 +263,7 @@ class LlamaAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
@ -291,14 +283,6 @@ class LlamaDecoderLayer(nn.Module):
quant_config = self.get_quant_config(vllm_config)
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -326,8 +310,6 @@ class LlamaDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

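# Sketch (not part of this commit's diff): out-of-tree subclasses that override
# LlamaAttention's rotary hook adapt the same way — the rope_scaling argument
# disappears and the rope config is read from config.rope_parameters. The
# signature and the get_rope call mirror the hunks above; treat the import
# paths as assumptions for the vLLM version this commit targets.
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.models.llama import LlamaAttention


class MyAttention(LlamaAttention):
    def _init_rotary_emb(self, config, quant_config) -> None:
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            rope_parameters=config.rope_parameters,  # replaces base= / rope_scaling=
            is_neox_style=True,
            partial_rotary_factor=self.partial_rotary_factor,
        )
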
View File

@ -19,7 +19,6 @@
"""Inference-only LLaMA model compatible with HuggingFace weights."""
from collections.abc import Iterable
from typing import Any
import torch
from torch import nn
@ -171,8 +170,6 @@ class Llama4Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -208,7 +205,6 @@ class Llama4Attention(nn.Module):
self.floor_scale = getattr(config, "floor_scale", 8192.0)
self.attn_scale = getattr(config, "attn_scale", 0.1)
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.n_rep = self.num_heads // self.num_kv_heads
self.qk_norm = (
@ -248,8 +244,7 @@ class Llama4Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=int(rope_theta),
rope_scaling=rope_scaling if rope_scaling != "default" else None,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
if not self.nope
@ -331,8 +326,6 @@ class Llama4DecoderLayer(nn.Module):
self.layer_idx = extract_layer_index(prefix)
self.global_layer = config.no_rope_layers[self.layer_idx] == 0
self.hidden_size = config.hidden_size
rope_theta = config.rope_theta
rope_scaling = config.rope_scaling
max_position_embeddings = config.max_position_embeddings
self.self_attn = Llama4Attention(
@ -340,8 +333,6 @@ class Llama4DecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=False,

View File

@ -108,8 +108,7 @@ class FlashConfig(PretrainedConfig):
eos_token_id=100001,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=1000000.0,
rope_scaling=None,
rope_parameters=None,
attention_bias=False,
attention_dropout=0.0,
mla_scale_q_lora=False,
@ -162,8 +161,13 @@ class FlashConfig(PretrainedConfig):
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Try to set `rope_scaling` if available, otherwise use `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 1000000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mla_scale_q_lora = mla_scale_q_lora
@ -336,15 +340,7 @@ class FlashDecoderLayer(nn.Module):
super().__init__()
self.layer_idx = int(prefix.split(sep=".")[-1])
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
# Dual attention structure
self.self_attn = nn.ModuleList(
@ -361,8 +357,6 @@ class FlashDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None
),
kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=None

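# Sketch (not part of this commit's diff): the kwargs handling above keeps
# checkpoints that still ship rope_theta / rope_scaling loadable. Standalone
# illustration of the same fold-in; the function name and sample values are
# invented for the example.
def merge_legacy_rope_kwargs(rope_parameters: dict | None, **kwargs) -> dict:
    rope_scaling = kwargs.pop("rope_scaling", None)
    params = rope_scaling or rope_parameters or {"rope_type": "default"}
    params.setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0))
    return params


# An old-style checkpoint's kwargs resolve to the new unified dict:
print(merge_legacy_rope_kwargs(
    None, rope_theta=500_000.0, rope_scaling={"rope_type": "yarn", "factor": 4.0}
))
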
View File

@ -230,8 +230,7 @@ class MiniCPMAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -257,7 +256,6 @@ class MiniCPMAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -281,8 +279,7 @@ class MiniCPMAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
@ -324,8 +321,6 @@ class MiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix
self._init_attn_block()
@ -339,8 +334,7 @@ class MiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=self.config.rope_parameters,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,

View File

@ -25,8 +25,6 @@
# limitations under the License.
"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
from typing import Any
import torch
from torch import nn
from transformers import PretrainedConfig
@ -62,8 +60,6 @@ class MiniCPM3Attention(nn.Module):
v_head_dim: int,
q_lora_rank: int,
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -84,7 +80,6 @@ class MiniCPM3Attention(nn.Module):
self.num_local_heads = num_heads // tp_size
self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.q_a_proj = ReplicatedLinear(
@ -127,8 +122,7 @@ class MiniCPM3Attention(nn.Module):
self.qk_rope_head_dim,
rotary_dim=self.qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_local_heads,
@ -204,8 +198,6 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer):
v_head_dim=self.config.v_head_dim,
q_lora_rank=self.config.q_lora_rank,
kv_lora_rank=self.config.kv_lora_rank,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,

View File

@ -69,8 +69,6 @@ class EagleMiniCPMDecoderLayer(nn.Module):
self.cache_config = cache_config
self.quant_config = quant_config
self.hidden_size = config.hidden_size
self.rope_theta = getattr(config, "rope_theta", 10000)
self.rope_scaling = getattr(config, "rope_scaling", None)
self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.prefix = prefix
self._init_attn_block()
@ -84,8 +82,7 @@ class EagleMiniCPMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=self.config.num_attention_heads,
num_kv_heads=self.config.num_key_value_heads,
rope_theta=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=self.config.rope_parameters,
max_position_embeddings=self.max_position_embeddings,
cache_config=self.cache_config,
quant_config=self.quant_config,

View File

@ -149,8 +149,7 @@ class MiniMaxM2Attention(nn.Module):
num_heads: int,
num_kv_heads: int,
rotary_dim: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
attn_window_size: int | None = None,
max_position_embeddings: int = 8192,
head_dim: int | None = None,
@ -180,7 +179,6 @@ class MiniMaxM2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -205,8 +203,7 @@ class MiniMaxM2Attention(nn.Module):
self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -252,8 +249,6 @@ class MiniMaxM2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int):
max_position_embeddings = max(
@ -269,8 +264,7 @@ class MiniMaxM2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rotary_dim=config.rotary_dim,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),

View File

@ -188,7 +188,7 @@ class MiniMaxText01Attention(nn.Module):
num_kv_heads: int,
rotary_dim: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
rope_parameters: dict | None = None,
sliding_window: int | None = None,
quant_config: QuantizationConfig | None = None,
layer_idx: int = None,
@ -214,7 +214,6 @@ class MiniMaxText01Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.sliding_window = sliding_window
self.prefix = prefix
@ -247,7 +246,7 @@ class MiniMaxText01Attention(nn.Module):
head_size=self.head_dim,
rotary_dim=rotary_dim,
max_position=max_position,
base=int(rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
dtype=torch.float32,
)
@ -287,8 +286,6 @@ class MiniMaxText01DecoderLayer(nn.Module):
self.hidden_size = config.hidden_size
self.expert_num = expert_num
rope_theta = getattr(config, "rope_theta", 10000)
head_dim = getattr(config, "head_dim", None)
if head_dim is None:
head_dim = config.hidden_size // config.num_attention_heads
@ -328,7 +325,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
else head_dim,
num_kv_heads=config.num_key_value_heads,
max_position=max_position_embeddings,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
sliding_window=config.sliding_window,
quant_config=quant_config,
layer_idx=self._ilayer,

View File

@ -161,7 +161,6 @@ class MixtralAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@ -189,7 +188,6 @@ class MixtralAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -211,7 +209,7 @@ class MixtralAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=int(self.rope_theta),
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(
@ -248,15 +246,12 @@ class MixtralDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = MixtralAttention(
config=config,
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",

View File

@ -292,13 +292,17 @@ class Llama4VisionAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
rope_parameters = {
"rope_type": "mllama4",
"rope_theta": config.rope_parameters["rope_theta"],
}
self.rotary_emb = get_rope(
head_size=self.head_dim,
rotary_dim=config.hidden_size // config.num_attention_heads // 2,
# number of image patches
max_position=(config.image_size // config.patch_size) ** 2,
base=config.rope_theta,
rope_scaling={"rope_type": "mllama4"},
rope_parameters=rope_parameters,
is_neox_style=False,
dtype=torch.complex64, # important
)

View File

@ -410,7 +410,6 @@ class MolmoAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
# Attention input projection. Projects x -> (q, k, v)
self.qkv_proj = QKVParallelLinear(
@ -437,7 +436,7 @@ class MolmoAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_parameters=config.rope_parameters,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(

View File

@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -150,8 +149,6 @@ class NemotronAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -181,7 +178,6 @@ class NemotronAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.max_position_embeddings = max_position_embeddings
@ -206,8 +202,7 @@ class NemotronAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(
@ -243,14 +238,6 @@ class NemotronDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -264,8 +251,6 @@ class NemotronDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -26,7 +26,6 @@
from collections.abc import Iterable
from itertools import islice
from typing import Any
import torch
from torch import nn
@ -82,8 +81,6 @@ class DeciLMAttention(LlamaAttention):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -97,8 +94,6 @@ class DeciLMAttention(LlamaAttention):
hidden_size,
num_heads,
num_kv_heads,
rope_theta,
rope_scaling,
max_position_embeddings,
quant_config,
bias,
@ -111,7 +106,6 @@ class DeciLMAttention(LlamaAttention):
def _init_rotary_emb(
self,
config,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
# Enables YARN for Mistral and LLaMA4 derivatives.
@ -126,8 +120,7 @@ class DeciLMAttention(LlamaAttention):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
@ -148,14 +141,6 @@ class DeciLMDecoderLayer(nn.Module):
self._is_no_op_ffn = block_config.ffn.no_op
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -176,8 +161,6 @@ class DeciLMDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=num_kv_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -87,7 +87,6 @@ class OlmoAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.clip_qkv = config.clip_qkv
# Attention input projection. Projects x -> (q, k, v)
@ -105,7 +104,7 @@ class OlmoAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_parameters=config.rope_parameters,
)
self.scaling = self.head_dim**-0.5
self.attn = Attention(

View File

@ -99,7 +99,6 @@ class Olmo2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.max_position_embeddings = self.config.max_position_embeddings
self.rope_theta = self.config.rope_theta
# Attention input projection. Projects x -> (q, k, v)
self.qkv_proj = QKVParallelLinear(
@ -139,15 +138,17 @@ class Olmo2Attention(nn.Module):
prefix=f"{prefix}.attn",
)
# Rotary embeddings. Rope scaling is only applied on full attention
# layers.
self.rope_scaling = self.config.rope_scaling if sliding_window is None else None
# Rotary embeddings. Rope scaling is only applied on full attention layers.
if sliding_window is None:
rope_parameters = self.config.rope_parameters
else:
rope_theta = self.config.rope_parameters["rope_theta"]
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta, # type: ignore
rope_scaling=self.rope_scaling,
rope_parameters=rope_parameters,
)
# Attention output projection.

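# Sketch (not part of this commit's diff): the Olmo2 rule above in isolation —
# rope scaling only applies to full-attention layers, while sliding-window
# layers keep the base rope_theta and drop any scaling. Sample values made up.
def olmo2_layer_rope_parameters(
    rope_parameters: dict, sliding_window: int | None
) -> dict:
    if sliding_window is None:
        return rope_parameters
    return {"rope_type": "default", "rope_theta": rope_parameters["rope_theta"]}


full = {"rope_type": "yarn", "factor": 8.0, "rope_theta": 500_000.0}
print(olmo2_layer_rope_parameters(full, sliding_window=None))  # scaled
print(olmo2_layer_rope_parameters(full, sliding_window=4096))  # default rope only
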
View File

@ -123,8 +123,6 @@ class OlmoeAttention(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
num_heads = config.num_attention_heads
@ -148,7 +146,6 @@ class OlmoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -176,8 +173,7 @@ class OlmoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(

View File

@ -77,6 +77,7 @@ from vllm.model_executor.models.utils import (
sequence_parallel_chunk,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
def check_ffn_act_fn(act_fn: str):
@ -259,7 +260,6 @@ class OpenPanguMLAAttention(nn.Module):
v_head_dim: int,
q_lora_rank: int | None,
kv_lora_rank: int,
rope_theta: float = 10000,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -274,8 +274,6 @@ class OpenPanguMLAAttention(nn.Module):
self.v_head_dim = v_head_dim
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank
self.rope_theta = rope_theta
self.tp_size = get_tensor_model_parallel_world_size()
if num_heads % self.tp_size != 0:
raise ValueError(
@ -339,7 +337,9 @@ class OpenPanguMLAAttention(nn.Module):
)
# TODO: remove hard coding
rope_scaling = {
set_default_rope_theta(config, default_theta=10000)
rope_parameters = {
"rope_theta": config.rope_parameters["rope_theta"],
"beta_fast": 32,
"beta_slow": 1,
"factor": 1,
@ -353,8 +353,7 @@ class OpenPanguMLAAttention(nn.Module):
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
is_neox_style=False,
)
@ -407,8 +406,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -454,7 +451,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -475,9 +471,7 @@ class OpenPanguEmbeddedAttention(nn.Module):
prefix=f"{prefix}.o_proj",
)
self._init_rotary_emb(
config, rope_scaling=rope_scaling, quant_config=quant_config
)
self._init_rotary_emb(config, quant_config=quant_config)
if hasattr(config, "interleaved_sliding_window"):
interleaved_sliding_window = config.interleaved_sliding_window
@ -521,7 +515,6 @@ class OpenPanguEmbeddedAttention(nn.Module):
def _init_rotary_emb(
self,
config: PretrainedConfig,
rope_scaling: dict[str, Any] | None,
quant_config: QuantizationConfig | None,
) -> None:
is_neox_style = True
@ -533,8 +526,7 @@ class OpenPanguEmbeddedAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=is_neox_style,
)
@ -555,7 +547,6 @@ class OpenPanguDecoderLayer(nn.Module):
parallel_config = vllm_config.parallel_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
layer_idx = int(prefix.split(sep=".")[-1])
@ -579,7 +570,6 @@ class OpenPanguDecoderLayer(nn.Module):
config.q_lora_rank if hasattr(config, "q_lora_rank") else None
),
kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,
@ -607,8 +597,6 @@ class OpenPanguDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=getattr(config, "rope_scaling", None),
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -88,8 +88,7 @@ class OrionAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -115,7 +114,6 @@ class OrionAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -139,8 +137,7 @@ class OrionAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -175,15 +172,12 @@ class OrionDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.self_attn = OrionAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -112,10 +112,8 @@ class OuroAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
@ -140,7 +138,6 @@ class OuroAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
# Get total_ut_steps from config, default to 4 if not specified
@ -170,8 +167,7 @@ class OuroAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = nn.ModuleList()
@ -226,9 +222,6 @@ class OuroDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@ -244,10 +237,8 @@ class OuroDecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,

View File

@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module):
self.num_heads = self.total_num_heads // tensor_parallel_world_size
self.head_dim = self.hidden_size // self.total_num_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.partial_rotary_factor = config.partial_rotary_factor
self.is_causal = True
@ -138,7 +137,7 @@ class PersimmonAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=self.rope_theta,
rope_parameters=config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.scaling = self.head_dim**-0.5

View File

@ -115,16 +115,12 @@ class PhiAttention(nn.Module):
)
assert rotary_dim % 2 == 0
# pylint: disable=C0301
# Refer to:
# https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518
rope_theta = getattr(config, "rope_theta", 10000.0)
max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,

View File

@ -86,7 +86,7 @@ class PhiMoEConfig(PretrainedConfig):
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=1e6,
rope_parameters=None,
sliding_window=None,
attention_dropout=0.0,
num_experts_per_tok=2,
@ -119,7 +119,9 @@ class PhiMoEConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
if rope_parameters is None:
rope_theta = kwargs.pop("rope_theta", 1e6)
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
@ -302,12 +304,11 @@ class PhiMoEAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict,
head_dim: int | None = None,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: dict | None = None,
prefix: str = "",
) -> None:
super().__init__()
@ -332,8 +333,6 @@ class PhiMoEAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -355,9 +354,8 @@ class PhiMoEAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=int(self.rope_theta),
rope_parameters=rope_parameters,
is_neox_style=True,
rope_scaling=self.rope_scaling,
)
self.attn = Attention(
self.num_heads,
@ -393,7 +391,6 @@ class PhiMoEDecoderLayer(nn.Module):
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 10000)
self.self_attn = PhiMoEAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
@ -402,10 +399,9 @@ class PhiMoEDecoderLayer(nn.Module):
head_dim=getattr(
config, "head_dim", self.hidden_size // config.num_attention_heads
),
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
)
self.block_sparse_moe = PhiMoE(

View File

@ -567,10 +567,6 @@ class Plamo2AttentionMixer(nn.Module):
prefix=f"{prefix}.o_proj",
)
self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000
self.rope_scaling = (
config.rope_scaling if hasattr(config, "rope_scaling") else None
)
max_position = config.max_position_embeddings
if hasattr(vllm_config.model_config, "max_model_len") and isinstance(
vllm_config.model_config.max_model_len, int
@ -581,8 +577,7 @@ class Plamo2AttentionMixer(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=self.rope_scaling,
rope_parameters=config.rope_parameters,
)
self.q_norm = RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps)
self.q_norm.weight = torch.nn.Parameter(

View File

@ -83,8 +83,7 @@ class QWenAttention(nn.Module):
hidden_size: int,
num_heads: int,
max_position_embeddings: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
@ -117,8 +116,7 @@ class QWenAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -153,14 +151,11 @@ class QWenBlock(nn.Module):
super().__init__()
self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
self.attn = QWenAttention(
config.hidden_size,
config.num_attention_heads,
config.max_position_embeddings,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",

View File

@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import is_interleaved
from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .utils import (
@ -114,11 +114,10 @@ class Qwen2Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict[str, Any],
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
@ -143,7 +142,6 @@ class Qwen2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear(
@ -167,8 +165,7 @@ class Qwen2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
attn_cls = (
@ -216,9 +213,7 @@ class Qwen2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1000000)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@ -237,10 +232,9 @@ class Qwen2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,

View File

@ -641,7 +641,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)

View File

@ -194,8 +194,7 @@ class Qwen2MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
@ -222,7 +221,6 @@ class Qwen2MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config
@ -248,8 +246,7 @@ class Qwen2MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
@ -291,8 +288,6 @@ class Qwen2MoeDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@ -301,8 +296,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

View File

@ -643,7 +643,6 @@ class Qwen2VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)

View File

@ -42,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
from .qwen2 import Qwen2MLP as Qwen3MLP
@ -57,14 +58,13 @@ class Qwen3Attention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: dict,
max_position: int = 4096 * 32,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
qkv_bias: bool = False,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: dict[str, Any] | None = None,
@ -89,7 +89,6 @@ class Qwen3Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.dual_chunk_attention_config = dual_chunk_attention_config
self.qkv_proj = QKVParallelLinear(
@ -113,8 +112,7 @@ class Qwen3Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
@ -166,9 +164,7 @@ class Qwen3DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1000000)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
)
@ -187,13 +183,12 @@ class Qwen3DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),
head_dim=getattr(config, "head_dim", None),
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,

View File

@ -216,8 +216,7 @@ class Qwen3MoeAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
@ -247,7 +246,6 @@ class Qwen3MoeAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.dual_chunk_attention_config = dual_chunk_attention_config
@ -273,8 +271,7 @@ class Qwen3MoeAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
dual_chunk_attention_config=dual_chunk_attention_config,
)
self.attn = Attention(
@ -326,8 +323,6 @@ class Qwen3MoeDecoderLayer(nn.Module):
quant_config = vllm_config.quant_config
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
dual_chunk_attention_config = getattr(
config, "dual_chunk_attention_config", None
@ -336,8 +331,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, "attention_bias", False),

View File

@ -748,8 +748,7 @@ class Qwen3NextAttention(nn.Module):
head_size=self.head_dim,
rotary_dim=self.head_dim,
max_position=config.max_position_embeddings,
base=config.rope_theta,
rope_scaling=config.rope_scaling,
rope_parameters=config.rope_parameters,
partial_rotary_factor=config.partial_rotary_factor,
dual_chunk_attention_config=self.dual_chunk_attention_config,
)

View File

@ -338,7 +338,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)

View File

@ -345,7 +345,6 @@ class Qwen3_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)

View File

@ -54,6 +54,7 @@ from vllm.model_executor.model_loader.weight_utils import (
maybe_remap_kv_scale_name,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (
@ -112,11 +113,10 @@ class SeedOssAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
head_dim: int,
rope_parameters: dict,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
rope_scaling: tuple | None = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
) -> None:
@ -140,7 +140,6 @@ class SeedOssAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.qkv_proj = QKVParallelLinear(
hidden_size,
@ -163,8 +162,7 @@ class SeedOssAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position,
base=self.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -200,9 +198,7 @@ class SeedOssDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)
set_default_rope_theta(config, default_theta=1000000)
# By default, SeedOss uses causal attention as it is a
# decoder-only model.
@ -219,10 +215,9 @@ class SeedOssDecoderLayer(nn.Module):
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
head_dim=config.head_dim,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
)

View File

@ -25,7 +25,6 @@
"""Inference-only Solar model compatible with HuggingFace weights."""
from collections.abc import Iterable
from typing import Any
import torch
from torch import nn
@ -111,8 +110,6 @@ class SolarAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: dict[str, Any] | None = None,
max_position_embeddings: int = 8192,
quant_config: QuantizationConfig | None = None,
bias: bool = False,
@ -142,7 +139,6 @@ class SolarAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings
self.qkv_proj = QKVParallelLinear(
@ -166,8 +162,7 @@ class SolarAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
)
self.attn = Attention(
self.num_heads,
@ -202,15 +197,6 @@ class SolarDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None and getattr(
config, "original_max_position_embeddings", None
):
rope_scaling["original_max_position_embeddings"] = (
config.original_max_position_embeddings
)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
# Support abacusai/Smaug-72B-v0.1 with attention_bias
# Support internlm/internlm-7b with bias
@ -224,8 +210,6 @@ class SolarDecoderLayer(nn.Module):
num_kv_heads=getattr(
config, "num_key_value_heads", config.num_attention_heads
),
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
quant_config=quant_config,
bias=attention_bias,

View File

@ -153,7 +153,7 @@ class StablelmAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.config.max_position_embeddings,
base=self.config.rope_theta,
rope_parameters=self.config.rope_parameters,
partial_rotary_factor=self.partial_rotary_factor,
)
self.attn = Attention(

View File

@ -91,7 +91,6 @@ class Starcoder2Attention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = config.rope_theta
self.max_position_embeddings = config.max_position_embeddings
self.use_bias = config.use_bias
@ -115,7 +114,7 @@ class Starcoder2Attention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=self.max_position_embeddings,
base=int(self.rope_theta),
rope_parameters=config.rope_parameters,
is_neox_style=True,
)
self.attn = Attention(

View File

@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.step3_vl import Step3TextConfig
from .interfaces import SupportsPP
from .utils import (
@ -144,9 +145,8 @@ class Step3TextAttention(nn.Module):
num_heads: int,
num_kv_heads: int,
norm_eps: float,
rope_theta: int,
rope_parameters: dict[str, Any],
share_q_dim: int | None = None,
rope_scaling: dict[str, Any] | None = None,
max_position_embedding: int = 8192,
head_dim: int = 256,
cache_config: CacheConfig | None = None,
@ -198,8 +198,7 @@ class Step3TextAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embedding,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
scaling = self.head_dim**-0.5
self.attn = Attention(
@ -227,15 +226,13 @@ class Step3TextAttention(nn.Module):
class Step3TextDecoderLayer(nn.Module):
def __init__(
self,
config: ModelConfig,
config: Step3TextConfig,
cache_config: CacheConfig | None = None,
quant_config: QuantizationConfig | None = None,
prefix: str = "",
) -> None:
super().__init__()
config = config.hf_config
self.hidden_size = config.hidden_size
rope_scaling = getattr(config, "rope_scaling", None)
self.self_attn = Step3TextAttention(
hidden_size=self.hidden_size,
@ -247,8 +244,7 @@ class Step3TextDecoderLayer(nn.Module):
max_position_embedding=config.max_position_embedding,
head_dim=config.head_dim,
share_q_dim=config.share_q_dim,
rope_theta=config.rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
prefix=f"{prefix}.self_attn",
)
@ -338,7 +334,7 @@ class Step3TextModel(nn.Module):
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
lambda prefix: Step3TextDecoderLayer(
config=vllm_config.model_config,
config=config,
cache_config=cache_config,
quant_config=quant_config,
prefix=prefix,

View File

@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Literal
import torch
from torch import nn
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from vllm.config.utils import getattr_iter
from vllm.logger import init_logger
@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool:
"""
text_config = vllm_config.model_config.hf_config.get_text_config()
# Dynamic rope scaling is not compatible with torch.compile
rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {}
return rope_scaling.get("rope_type") != "dynamic"
rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {}
if rope_parameters:
# Nest rope_parameters if not nested already to simplify logic
if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
rope_parameters = {"": rope_parameters}
return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values())
return True
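For intuition, here is a minimal standalone sketch of the same dynamic-RoPE check on both a flat and a per-layer-type `rope_parameters` dict; the layer-type keys below are illustrative assumptions, and in vLLM the allowed set actually comes from `transformers.configuration_utils.ALLOWED_LAYER_TYPES`:

# Hedged sketch of the check above; not the vLLM implementation itself.
ALLOWED_LAYER_TYPES = ("full_attention", "sliding_attention")  # assumed example values

def no_dynamic_rope(rope_parameters: dict | None) -> bool:
    """True when torch.compile is safe w.r.t. RoPE, i.e. no 'dynamic' rope_type."""
    rope_parameters = rope_parameters or {}
    if not rope_parameters:
        return True
    # Wrap a flat dict so flat and per-layer-type configs share one code path.
    if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
        rope_parameters = {"": rope_parameters}
    return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values())

# A flat config with dynamic scaling blocks compilation...
assert not no_dynamic_rope({"rope_type": "dynamic", "rope_theta": 1e6})
# ...while a nested config is fine as long as every layer type is non-dynamic.
assert no_dynamic_rope({
    "full_attention": {"rope_type": "yarn", "rope_theta": 1e6},
    "sliding_attention": {"rope_type": "default", "rope_theta": 1e6},
})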

View File

@ -128,7 +128,6 @@ class Zamba2Attention(nn.Module):
tp_size = get_tensor_model_parallel_world_size()
self.config = config
self.num_hybrid_layers = num_hybrid_layers
self.rope_theta = config.rope_theta
self.attention_hidden_size = config.attention_hidden_size
self.total_num_attention_heads = config.num_attention_heads
@ -233,8 +232,7 @@ class Zamba2Attention(nn.Module):
head_size=self.attention_head_dim,
rotary_dim=self.attention_head_dim,
max_position=config.max_position_embeddings,
base=self.rope_theta,
rope_scaling=None,
rope_parameters=config.rope_parameters,
is_neox_style=True,
)

View File

@ -7,8 +7,9 @@ import time
from collections.abc import Callable
from dataclasses import asdict
from functools import cache, partial
from importlib.metadata import version
from pathlib import Path
from typing import Any, Literal, TypeVar
from typing import Any, Literal, TypeAlias, TypeVar
import huggingface_hub
from huggingface_hub import (
@ -24,7 +25,9 @@ from huggingface_hub.utils import (
RepositoryNotFoundError,
RevisionNotFoundError,
)
from packaging.version import Version
from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
from transformers.configuration_utils import ALLOWED_LAYER_TYPES
from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@ -390,21 +393,61 @@ def file_or_path_exists(
)
def patch_rope_scaling(config: PretrainedConfig) -> None:
def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None:
"""Some models may have no rope_theta in their config but still use RoPE.
This function sets a default rope_theta if it's missing."""
if getattr(config, "rope_parameters", None) is None:
config.rope_parameters = {"rope_type": "default"}
if "rope_theta" not in config.rope_parameters:
config.rope_parameters["rope_theta"] = default_theta
def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE."""
text_config = getattr(config, "text_config", None)
if text_config is not None:
patch_rope_scaling(text_config)
# Retrieve rope_parameters differently based on Transformers version
if Version(version("transformers")) >= Version("5.0.0.dev0"):
from transformers.modeling_rope_utils import RopeParameters
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is not None:
patch_rope_scaling_dict(rope_scaling)
rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr(
config, "rope_parameters", None
)
elif hasattr(config, "rope_parameters"):
# We are in Transformers v4 and rope_parameters
# has already been patched for this config
return
else:
# Convert Transformers v4 rope_theta and rope_scaling into rope_parameters
rope_theta: float | None = getattr(config, "rope_theta", None)
rope_scaling: dict | None = getattr(config, "rope_scaling", None)
rope_parameters = rope_scaling
# Move rope_theta into rope_parameters
if rope_theta is not None:
rope_parameters = rope_parameters or {"rope_type": "default"}
rope_parameters["rope_theta"] = rope_theta
# Add original_max_position_embeddings if present
if rope_parameters and (
ompe := getattr(config, "original_max_position_embeddings", None)
):
rope_parameters["original_max_position_embeddings"] = ompe
# Write back to config
config.rope_parameters = rope_parameters
# No RoPE parameters to patch
if rope_parameters is None:
return
# Handle nested rope_parameters in interleaved sliding attention models
if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES):
for rope_parameters_layer_type in rope_parameters.values():
patch_rope_parameters_dict(rope_parameters_layer_type)
else:
patch_rope_parameters_dict(rope_parameters)
def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
if "rope_type" in rope_scaling and "type" in rope_scaling:
rope_type = rope_scaling["rope_type"]
rope_type_legacy = rope_scaling["type"]
def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
if "rope_type" in rope_parameters and "type" in rope_parameters:
rope_type = rope_parameters["rope_type"]
rope_type_legacy = rope_parameters["type"]
if rope_type != rope_type_legacy:
raise ValueError(
f"Found conflicts between 'rope_type={rope_type}' (modern "
@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None:
"You should only specify one of them."
)
if "rope_type" not in rope_scaling and "type" in rope_scaling:
rope_scaling["rope_type"] = rope_scaling["type"]
if "rope_type" not in rope_parameters and "type" in rope_parameters:
rope_parameters["rope_type"] = rope_parameters["type"]
logger.info("Replacing legacy 'type' key with 'rope_type'")
if "rope_type" not in rope_scaling:
raise ValueError("rope_scaling should have a 'rope_type' key")
if "rope_type" not in rope_parameters:
raise ValueError("rope_parameters should have a 'rope_type' key")
if rope_scaling["rope_type"] == "su":
rope_scaling["rope_type"] = "longrope"
if rope_parameters["rope_type"] == "su":
rope_parameters["rope_type"] = "longrope"
logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
elif rope_scaling["rope_type"] == "mrope":
assert "mrope_section" in rope_scaling
rope_scaling["rope_type"] = "default"
elif rope_parameters["rope_type"] == "mrope":
assert "mrope_section" in rope_parameters
rope_parameters["rope_type"] = "default"
logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
def _uses_mrope(config: PretrainedConfig) -> bool:
rope_scaling = getattr(config, "rope_scaling", None)
if rope_scaling is None:
rope_parameters = getattr(config, "rope_parameters", None)
if rope_parameters is None:
return False
return "mrope_section" in rope_scaling
return "mrope_section" in rope_parameters
def uses_mrope(config: PretrainedConfig) -> bool:
@ -690,7 +733,14 @@ def get_config(
logger.debug("Overriding HF config with %s", hf_overrides_fn)
config = hf_overrides_fn(config)
patch_rope_scaling(config)
# Exhaustively patch RoPE parameters everywhere they might be
patch_rope_parameters(config)
patch_rope_parameters(config.get_text_config())
SubConfigs: TypeAlias = dict[str, PretrainedConfig]
sub_configs: SubConfigs | None = getattr(config, "sub_configs", None)
if sub_configs:
for sub_config in sub_configs:
patch_rope_parameters(getattr(config, sub_config))
if trust_remote_code:
maybe_register_config_serialize_by_value()
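As a rough, self-contained illustration of the backwards-compatibility path above, the sketch below mimics how a Transformers v4 style config (separate `rope_theta` and `rope_scaling`, possibly with the legacy `type`/`su` spellings) ends up as one `rope_parameters` dict; it uses a plain namespace object instead of a real `PretrainedConfig`, so the field values are assumptions:

from types import SimpleNamespace

# Stand-in for a v4-era config that still carries rope_theta/rope_scaling.
cfg = SimpleNamespace(
    rope_theta=1_000_000.0,
    rope_scaling={"type": "su", "factor": 4.0},
    original_max_position_embeddings=4096,
)

# Mimic patch_rope_parameters(): fold the legacy fields into rope_parameters.
rope_parameters = dict(cfg.rope_scaling)
rope_parameters["rope_theta"] = cfg.rope_theta
rope_parameters["original_max_position_embeddings"] = (
    cfg.original_max_position_embeddings
)

# Mimic patch_rope_parameters_dict(): normalise legacy keys and rope types.
if "rope_type" not in rope_parameters and "type" in rope_parameters:
    rope_parameters["rope_type"] = rope_parameters["type"]
if rope_parameters["rope_type"] == "su":
    rope_parameters["rope_type"] = "longrope"

cfg.rope_parameters = rope_parameters
# cfg.rope_parameters now holds rope_type='longrope', rope_theta=1e6, factor=4.0
# and original_max_position_embeddings=4096 in a single dict.
# For a config with no RoPE fields at all, set_default_rope_theta(cfg, default_theta=1e6)
# would instead create {"rope_type": "default", "rope_theta": 1e6}.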

View File

@ -24,7 +24,7 @@ class AfmoeConfig(PretrainedConfig):
rms_norm_eps: float = 1e-5,
use_cache: bool = True,
tie_word_embeddings: bool = False,
rope_theta: float = 10000.0,
rope_parameters: dict | None = None,
rope_scaling: dict | None = None,
num_experts: int = 64,
num_experts_per_tok: int = 6,
@ -56,7 +56,10 @@ class AfmoeConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
rope_theta = kwargs.pop("rope_theta", 10000.0)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.rope_scaling = rope_scaling
self.moe_intermediate_size = moe_intermediate_size

View File

@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig):
The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
rope_theta (`float`, *optional*, defaults to 1000000.0):
The base period of the RoPE embeddings.
rope_parameters (`dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
and expect the model to work with a longer `max_position_embeddings`, we recommend updating this value
accordingly.
Expected contents:
`rope_theta` (`float`): The base period of the RoPE embeddings.
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
sliding_window (`int`, *optional*):
Sliding window attention window size. If not specified, will default to `4096`.
attention_dropout (`float`, *optional*, defaults to 0.0):
@ -132,7 +139,7 @@ class ArcticConfig(PretrainedConfig):
bos_token_id=1,
eos_token_id=2,
tie_word_embeddings=False,
rope_theta=1e6,
rope_parameters: dict[str, Any] | None = None,
sliding_window=None,
attention_dropout=0.0,
num_experts_per_tok=1,
@ -165,7 +172,10 @@ class ArcticConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
rope_theta = kwargs.pop("rope_theta", 1e6)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
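Purely as an illustration of the documented dict and the legacy-kwarg fallback above, here is a small hedged sketch; the helper name and values are made up and are not part of the Arctic config:

def resolve_rope_parameters(rope_parameters: dict | None, kwargs: dict) -> dict:
    """Hypothetical helper mirroring the fallback in the constructor above."""
    if rope_parameters is None:
        rope_theta = kwargs.pop("rope_theta", 1e6)  # legacy v4-style kwarg
        rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
    return rope_parameters

# New-style callers pass the documented dict directly...
params = resolve_rope_parameters(
    {"rope_type": "yarn", "rope_theta": 1e6, "factor": 8.0}, {}
)
assert params["rope_type"] == "yarn"
# ...while old checkpoints that only carry rope_theta keep working.
assert resolve_rope_parameters(None, {"rope_theta": 5e5}) == {
    "rope_type": "default",
    "rope_theta": 5e5,
}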

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
@ -25,8 +26,7 @@ class FlexOlmoConfig(PretrainedConfig):
bos_token_id=None,
eos_token_id=100257,
tie_word_embeddings=False,
rope_theta=500000.0,
rope_scaling=None,
rope_parameters: dict[str, Any] | None = None,
attention_bias=False,
attention_dropout=0.0,
num_experts_per_tok=5,
@ -62,8 +62,13 @@ class FlexOlmoConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Prefer the legacy `rope_scaling` kwarg if it was passed, otherwise fall back to `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 500000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.num_experts_per_tok = num_experts_per_tok
@ -73,5 +78,5 @@ class FlexOlmoConfig(PretrainedConfig):
self.norm_topk_prob = norm_topk_prob
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
if self.rope_parameters is not None and "type" in self.rope_parameters:
self.rope_parameters["rope_type"] = self.rope_parameters["type"]
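A hedged, standalone sketch of how this precedence plays out for callers; it reimplements only the few lines above and is not the actual config class:

def merge_rope_kwargs(rope_parameters: dict | None, kwargs: dict) -> dict:
    """Legacy rope_scaling wins, then rope_parameters, then a default dict."""
    rope_scaling = kwargs.pop("rope_scaling", None)
    rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
    rope_theta = kwargs.pop("rope_theta", 500000.0)
    if "rope_theta" not in rope_parameters:
        rope_parameters["rope_theta"] = rope_theta
    return rope_parameters

# Old checkpoint: only the legacy kwargs are present.
assert merge_rope_kwargs(
    None, {"rope_scaling": {"rope_type": "linear", "factor": 2.0}}
) == {"rope_type": "linear", "factor": 2.0, "rope_theta": 500000.0}
# New checkpoint: rope_parameters already carries rope_theta and is left untouched.
assert merge_rope_kwargs({"rope_type": "default", "rope_theta": 1e6}, {}) == {
    "rope_type": "default",
    "rope_theta": 1e6,
}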

View File

@ -29,8 +29,7 @@ class KimiLinearConfig(PretrainedConfig):
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
rope_theta=10000.0,
rope_scaling=None,
rope_parameters=None,
tie_word_embeddings=False,
moe_intermediate_size: int | None = None,
moe_renormalize: bool = True,
@ -73,8 +72,13 @@ class KimiLinearConfig(PretrainedConfig):
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
# Prefer the legacy `rope_scaling` kwarg if it was passed, otherwise fall back to `rope_parameters`
rope_scaling = kwargs.pop("rope_scaling", None)
rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank

View File

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `True`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 1000000.0):
The base period of the RoPE embeddings.
rope_parameters (`dict`, *optional*):
The parameters of the RoPE embeddings.
max_position_embeddings (`int`, *optional*, defaults to 128000):
The maximum sequence length that this model might ever be used with.
use_cache (`bool`, *optional*, defaults to `True`):
@ -100,7 +101,7 @@ class Lfm2MoeConfig(PretrainedConfig):
bos_token_id: int = 1,
eos_token_id: int = 2,
tie_word_embeddings: bool = True,
rope_theta: float = 1000000.0,
rope_parameters: dict[str, Any] | None = None,
max_position_embeddings: int = 128_000,
use_cache: bool = True,
norm_eps: float = 0.00001,
@ -121,7 +122,10 @@ class Lfm2MoeConfig(PretrainedConfig):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.rope_theta = rope_theta
rope_theta = kwargs.pop("rope_theta", 1000000.0)
if rope_parameters is None:
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
self.rope_parameters = rope_parameters
self.max_position_embeddings = max_position_embeddings
self.use_cache = use_cache
self.norm_eps = norm_eps

View File

@ -98,6 +98,6 @@ class MiDashengLMConfig(PretrainedConfig):
if text_config
else Qwen2_5OmniTextConfig()
)
self.text_config.rope_scaling = None # uses_mrope is false
self.text_config.rope_parameters = None # uses_mrope is false
self.audio_token_id = audio_token_id
super().__init__(**kwargs)

View File

@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
config["rope_scaling"] = {
config["rope_parameters"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
config["rope_scaling"][new_name] = yarn_config.pop(old_name)
config["rope_parameters"][new_name] = yarn_config.pop(old_name)
assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"
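For intuition, here is a small sketch of the remap on a toy Mistral-format config dict; only the `apply_scale` → `apply_yarn_scaling` entry is visible in the hunk above, so the key map here is deliberately limited to that single (assumed-representative) entry:

# Illustrative run of the remap logic above on a made-up config.
yarn_config_map = {"apply_scale": "apply_yarn_scaling"}  # subset of the real map

config = {"yarn": {"apply_scale": True}}
yarn_config = config.get("yarn") or {}
config["rope_parameters"] = {"rope_type": "yarn", "mscale_all_dim": 1}
for old_name, new_name in yarn_config_map.items():
    if old_name in yarn_config:
        config["rope_parameters"][new_name] = yarn_config.pop(old_name)
assert not yarn_config  # every recognised key was consumed
# config["rope_parameters"] == {"rope_type": "yarn", "mscale_all_dim": 1,
#                               "apply_yarn_scaling": True}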

Some files were not shown because too many files have changed in this diff.