[Bugfix] Fix encoder-only model support for transformers backend (#28021)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit 0ff05e3770
parent 428bc7bf1c
@@ -899,27 +899,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
 _TRANSFORMERS_BACKEND_MODELS = {
     "TransformersEmbeddingModel": _HfExamplesInfo(
-        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0"
+        "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0.dev"
     ),
     "TransformersForSequenceClassification": _HfExamplesInfo(
         "papluca/xlm-roberta-base-language-detection",
-        min_transformers_version="5.0.0",
+        min_transformers_version="5.0.0.dev",
     ),
     "TransformersForCausalLM": _HfExamplesInfo(
         "hmellor/Ilama-3.2-1B", trust_remote_code=True
     ),
     "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
     "TransformersMoEForCausalLM": _HfExamplesInfo(
-        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0"
+        "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo(
-        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEEmbeddingModel": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMoEForSequenceClassification": _HfExamplesInfo(
-        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0"
+        "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0.dev"
     ),
     "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"),
     "TransformersMultiModalForSequenceClassification": _HfExamplesInfo(
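The "5.0.0" to "5.0.0.dev" bumps above are the core of the registry change: under PEP 440, a .dev pre-release sorts before the corresponding final release, so a gate of "5.0.0" would wrongly exclude development builds of transformers 5.0. A quick sketch with packaging (illustration only, not part of the diff):

    from packaging.version import Version

    # A dev build of transformers 5.0 sorts *before* the final release...
    assert Version("5.0.0.dev0") < Version("5.0.0")
    # ...so it would fail a ">= 5.0.0" gate but passes ">= 5.0.0.dev"
    # ("5.0.0.dev" normalizes to "5.0.0.dev0" under PEP 440).
    assert Version("5.0.0.dev0") >= Version("5.0.0.dev")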
@@ -82,7 +82,7 @@ def test_models(
     from packaging.version import Version
 
     installed = Version(transformers.__version__)
-    required = Version("5.0.0")
+    required = Version("5.0.0.dev")
     if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
         pytest.skip(
             "MoE models with the Transformers backend require "
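The same ordering rule drives the test gate above. Pulled out of the test for clarity, the guard looks roughly like this (the helper name and skip message wording are hypothetical; the model string and version come from the diff):

    import pytest
    import transformers
    from packaging.version import Version

    def skip_if_moe_unsupported(model: str) -> None:
        # Hypothetical helper: skip MoE tests on pre-5.0 transformers builds.
        installed = Version(transformers.__version__)
        required = Version("5.0.0.dev")
        if model == "allenai/OLMoE-1B-7B-0924" and installed < required:
            pytest.skip(
                "MoE models with the Transformers backend require "
                f"transformers>={required}, installed {installed}"
            )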
@@ -28,6 +28,7 @@ from transformers import AutoModel
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
 
 from vllm.attention import Attention, AttentionType
+from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
@@ -317,7 +318,7 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
         # vLLM does not support encoder-decoder models, so if any encoder layer is
         # found in a text only model, we assume the whole model is an encoder model
         if has_encoder(self.model) and not is_multimodal(self.config):
-            self.check_version("4.57.0.dev0", "encoder models support")
+            self.check_version("5.0.0.dev0", "encoder models support")
             attn_type = AttentionType.ENCODER_ONLY
         else:
             attn_type = AttentionType.DECODER
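has_encoder and is_multimodal are existing vLLM helpers; this hunk only changes the version gate. For intuition, a structural check of the kind has_encoder performs might look like this (a hypothetical sketch, not vLLM's actual implementation):

    import torch.nn as nn

    def has_encoder_sketch(model: nn.Module) -> bool:
        # Hypothetical: treat the model as encoder-style if any submodule is
        # exposed under the conventional HF attribute name "encoder".
        return any(
            name.split(".")[-1] == "encoder" for name, _ in model.named_modules()
        )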
@@ -336,7 +337,12 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP):
         ):
             per_layer_sliding_window = self.config.sliding_window
 
-        attention_instances[i] = Attention(
+        attn_cls = (
+            EncoderOnlyAttention
+            if attn_type == AttentionType.ENCODER_ONLY
+            else Attention
+        )
+        attention_instances[i] = attn_cls(
             num_heads=num_heads,
             head_size=head_size,
             # NOTE: We use Llama scale as default, if it's set by
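This hunk is the heart of the bugfix: encoder-only layers are now constructed as EncoderOnlyAttention instead of plain Attention, while the constructor arguments stay identical, so only the class is swapped. The pattern in isolation (a generic, self-contained sketch; these stand-in classes are not vLLM's):

    from enum import Enum

    class AttentionType(Enum):
        DECODER = "decoder"
        ENCODER_ONLY = "encoder_only"

    class Attention:
        def __init__(self, num_heads: int, head_size: int):
            self.num_heads, self.head_size = num_heads, head_size

    class EncoderOnlyAttention(Attention):
        """Stand-in for the variant that builds bidirectional attention."""

    def make_attention(attn_type: AttentionType, num_heads: int, head_size: int) -> Attention:
        # Select the class once, then construct with the shared kwargs.
        attn_cls = (
            EncoderOnlyAttention
            if attn_type == AttentionType.ENCODER_ONLY
            else Attention
        )
        return attn_cls(num_heads=num_heads, head_size=head_size)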
@@ -115,7 +115,7 @@ direct_register_custom_op(
 
 class MoEMixin(MixtureOfExperts):
     def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
-        self.check_version("4.57.0.dev0", "MoE models support")
+        self.check_version("5.0.0.dev0", "MoE models support")
         # Skip MixtureOfExperts.__init__ and call the next class in MRO
         super(MixtureOfExperts, self).__init__(vllm_config=vllm_config, prefix=prefix)
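The super(MixtureOfExperts, self).__init__(...) call is the standard trick for skipping one class in the MRO: attribute lookup resumes at the class after MixtureOfExperts, so its __init__ never runs. A self-contained illustration (stand-in classes, not vLLM's):

    class Base:
        def __init__(self):
            print("Base.__init__")

    class MixtureOfExperts:
        def __init__(self):
            raise RuntimeError("should be skipped")

    class MoEMixin(MixtureOfExperts):
        def __init__(self):
            # Start the MRO search *after* MixtureOfExperts, so its
            # __init__ is bypassed and Python calls the next class in line.
            super(MixtureOfExperts, self).__init__()

    class Model(MoEMixin, Base):
        pass

    Model()  # prints "Base.__init__"; MixtureOfExperts.__init__ is never called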