mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-27 07:07:52 +08:00
[Bugfix] Fix ModernBert CUDA graph capturing in v1 (#21901)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
parent
35afe1b30b
commit
429e4e2d42
@ -162,7 +162,8 @@ def mteb_test_embed_models(hf_runner,
|
|||||||
vllm_runner,
|
vllm_runner,
|
||||||
model_info: EmbedModelInfo,
|
model_info: EmbedModelInfo,
|
||||||
vllm_extra_kwargs=None,
|
vllm_extra_kwargs=None,
|
||||||
hf_model_callback=None):
|
hf_model_callback=None,
|
||||||
|
atol=MTEB_RERANK_TOL):
|
||||||
if not model_info.enable_test:
|
if not model_info.enable_test:
|
||||||
# A model family has many models with the same architecture,
|
# A model family has many models with the same architecture,
|
||||||
# and we don't need to test each one.
|
# and we don't need to test each one.
|
||||||
@ -198,7 +199,7 @@ def mteb_test_embed_models(hf_runner,
|
|||||||
print("SentenceTransformers:", st_dtype, st_main_score)
|
print("SentenceTransformers:", st_dtype, st_main_score)
|
||||||
print("Difference:", st_main_score - vllm_main_score)
|
print("Difference:", st_main_score - vllm_main_score)
|
||||||
|
|
||||||
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
|
assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
|
||||||
|
|
||||||
|
|
||||||
def run_mteb_rerank(cross_encoder, tasks, languages):
|
def run_mteb_rerank(cross_encoder, tasks, languages):
|
||||||
|
|||||||
@ -466,7 +466,7 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.Tensor],
|
input_ids: torch.Tensor,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
token_type_ids: Optional[torch.Tensor] = None,
|
token_type_ids: Optional[torch.Tensor] = None,
|
||||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
|
|||||||
@ -8,13 +8,15 @@ from torch import nn
|
|||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.attention import Attention, AttentionType
|
from vllm.attention import Attention, AttentionType
|
||||||
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
from vllm.config import CacheConfig, VllmConfig
|
from vllm.config import CacheConfig, VllmConfig
|
||||||
from vllm.distributed import (divide, get_tensor_model_parallel_rank,
|
from vllm.distributed import (divide, get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
tensor_model_parallel_all_reduce)
|
tensor_model_parallel_all_reduce)
|
||||||
from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
|
from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
|
||||||
get_act_fn)
|
get_act_fn)
|
||||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||||
|
fused_topk, torch_vllm_outplace_fused_experts)
|
||||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||||
MergedColumnParallelLinear,
|
MergedColumnParallelLinear,
|
||||||
QKVParallelLinear,
|
QKVParallelLinear,
|
||||||
@ -284,15 +286,22 @@ class NomicMoE(nn.Module):
|
|||||||
hidden_states = hidden_states.view(-1, self.hidden_size)
|
hidden_states = hidden_states.view(-1, self.hidden_size)
|
||||||
# router_logits: (num_tokens, n_experts)
|
# router_logits: (num_tokens, n_experts)
|
||||||
router_logits, _ = self.router(hidden_states)
|
router_logits, _ = self.router(hidden_states)
|
||||||
final_hidden_states = fused_moe(hidden_states,
|
# FIXME(Isotr0py): This implementation is too tricky,
|
||||||
self.w1,
|
# we should use FusedMoE instead in the future
|
||||||
self.w2,
|
# after supporting ungated activation for it.
|
||||||
router_logits,
|
topk_weights, topk_ids, _ = fused_topk(hidden_states,
|
||||||
self.top_k,
|
router_logits,
|
||||||
renormalize=False,
|
self.top_k,
|
||||||
inplace=False,
|
renormalize=False)
|
||||||
activation=self.hidden_act,
|
final_hidden_states = torch_vllm_outplace_fused_experts(
|
||||||
is_act_and_mul=False)
|
hidden_states=hidden_states,
|
||||||
|
w1=self.w1,
|
||||||
|
w2=self.w2,
|
||||||
|
topk_weights=topk_weights,
|
||||||
|
topk_ids=topk_ids,
|
||||||
|
activation=self.hidden_act,
|
||||||
|
is_act_and_mul=False,
|
||||||
|
)
|
||||||
|
|
||||||
if self.tp_size > 1:
|
if self.tp_size > 1:
|
||||||
final_hidden_states = tensor_model_parallel_all_reduce(
|
final_hidden_states = tensor_model_parallel_all_reduce(
|
||||||
@ -391,6 +400,7 @@ class BertWithRopeEncoder(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
@support_torch_compile
|
||||||
class BertWithRope(nn.Module, SupportsQuant):
|
class BertWithRope(nn.Module, SupportsQuant):
|
||||||
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
|
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
|
||||||
|
|
||||||
@ -407,7 +417,7 @@ class BertWithRope(nn.Module, SupportsQuant):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.Tensor],
|
input_ids: torch.Tensor,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
inputs_embeds: Optional[torch.Tensor] = None,
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
@ -554,20 +564,6 @@ class JinaRobertaModel(BertWithRope):
|
|||||||
"norm2": "mlp_ln",
|
"norm2": "mlp_ln",
|
||||||
})
|
})
|
||||||
|
|
||||||
def forward(
|
|
||||||
self,
|
|
||||||
input_ids: torch.Tensor,
|
|
||||||
position_ids: torch.Tensor,
|
|
||||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
|
||||||
inputs_embeds: Optional[torch.Tensor] = None,
|
|
||||||
token_type_ids: Optional[torch.Tensor] = None,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
return super().forward(input_ids=input_ids,
|
|
||||||
positions=position_ids,
|
|
||||||
intermediate_tensors=intermediate_tensors,
|
|
||||||
inputs_embeds=inputs_embeds,
|
|
||||||
token_type_ids=token_type_ids)
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def jina_merge_lora_weights(self, weights: Iterable[tuple[str,
|
def jina_merge_lora_weights(self, weights: Iterable[tuple[str,
|
||||||
torch.Tensor]]):
|
torch.Tensor]]):
|
||||||
|
|||||||
@ -8,6 +8,7 @@ from torch import nn
|
|||||||
from transformers import ModernBertConfig
|
from transformers import ModernBertConfig
|
||||||
|
|
||||||
from vllm.attention import Attention, AttentionType
|
from vllm.attention import Attention, AttentionType
|
||||||
|
from vllm.compilation.decorators import support_torch_compile
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.linear import (QKVParallelLinear,
|
from vllm.model_executor.layers.linear import (QKVParallelLinear,
|
||||||
@ -46,7 +47,7 @@ class ModernBertEmbeddings(nn.Module):
|
|||||||
input_ids: torch.Tensor,
|
input_ids: torch.Tensor,
|
||||||
inputs_embeds: Optional[torch.Tensor] = None,
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
if inputs_embeds:
|
if inputs_embeds is not None:
|
||||||
return self.norm(inputs_embeds)
|
return self.norm(inputs_embeds)
|
||||||
else:
|
else:
|
||||||
inputs_embeds = self.tok_embeddings(input_ids)
|
inputs_embeds = self.tok_embeddings(input_ids)
|
||||||
@ -117,7 +118,7 @@ class ModernBertAttention(nn.Module):
|
|||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: torch.Tensor,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
qkv, _ = self.Wqkv(hidden_states)
|
qkv, _ = self.Wqkv(hidden_states)
|
||||||
q, k, v = qkv.split([self.all_head_size] * 3, dim=-1)
|
q, k, v = qkv.split([self.all_head_size] * 3, dim=-1)
|
||||||
@ -169,9 +170,9 @@ class ModernBertLayer(nn.Module):
|
|||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: torch.Tensor,
|
||||||
):
|
) -> torch.Tensor:
|
||||||
attn_outputs = self.attn(self.attn_norm(hidden_states),
|
attn_outputs = self.attn(hidden_states=self.attn_norm(hidden_states),
|
||||||
position_ids=position_ids)
|
position_ids=position_ids)
|
||||||
hidden_states = hidden_states + attn_outputs
|
hidden_states = hidden_states + attn_outputs
|
||||||
mlp_output = self.mlp(self.mlp_norm(hidden_states))
|
mlp_output = self.mlp(self.mlp_norm(hidden_states))
|
||||||
@ -192,13 +193,14 @@ class ModernBertEncoderLayer(nn.Module):
|
|||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
position_ids: torch.Tensor,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
for i, layer in enumerate(self.layers):
|
for i, layer in enumerate(self.layers):
|
||||||
hidden_states = layer(hidden_states, position_ids)
|
hidden_states = layer(hidden_states, position_ids)
|
||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
@support_torch_compile
|
||||||
class ModernBertModel(nn.Module):
|
class ModernBertModel(nn.Module):
|
||||||
hf_to_vllm_mapper = WeightsMapper(
|
hf_to_vllm_mapper = WeightsMapper(
|
||||||
orig_to_new_prefix={"layers.": "encoder_layer.layers."})
|
orig_to_new_prefix={"layers.": "encoder_layer.layers."})
|
||||||
@ -234,13 +236,11 @@ class ModernBertModel(nn.Module):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: torch.Tensor,
|
||||||
positions: Optional[torch.Tensor] = None,
|
positions: torch.Tensor,
|
||||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
inputs_embeds: Optional[torch.Tensor] = None,
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
position_ids = positions if positions is not None else position_ids
|
|
||||||
if inputs_embeds is not None:
|
if inputs_embeds is not None:
|
||||||
hidden_states = inputs_embeds
|
hidden_states = inputs_embeds
|
||||||
else:
|
else:
|
||||||
@ -249,7 +249,7 @@ class ModernBertModel(nn.Module):
|
|||||||
|
|
||||||
outputs = self.encoder_layer(
|
outputs = self.encoder_layer(
|
||||||
hidden_states=hidden_states,
|
hidden_states=hidden_states,
|
||||||
position_ids=position_ids,
|
position_ids=positions,
|
||||||
)
|
)
|
||||||
norm_outputs = self.final_norm(outputs)
|
norm_outputs = self.final_norm(outputs)
|
||||||
return norm_outputs
|
return norm_outputs
|
||||||
|
|||||||
@ -105,7 +105,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
|
|||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.Tensor],
|
input_ids: torch.Tensor,
|
||||||
positions: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
token_type_ids: Optional[torch.Tensor] = None,
|
token_type_ids: Optional[torch.Tensor] = None,
|
||||||
intermediate_tensors: Optional[IntermediateTensors] = None,
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
@ -119,8 +119,8 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
|
|||||||
position_ids=positions,
|
position_ids=positions,
|
||||||
padding_idx=self.padding_idx)
|
padding_idx=self.padding_idx)
|
||||||
|
|
||||||
return self.model(input_ids=input_ids,
|
return self.model(input_ids,
|
||||||
position_ids=positions,
|
positions,
|
||||||
token_type_ids=token_type_ids,
|
token_type_ids=token_type_ids,
|
||||||
inputs_embeds=inputs_embeds,
|
inputs_embeds=inputs_embeds,
|
||||||
intermediate_tensors=intermediate_tensors)
|
intermediate_tensors=intermediate_tensors)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user