mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-03 08:31:24 +08:00
parent
333681408f
commit
ca7a2d5f28
@@ -161,13 +161,8 @@ class RotaryEmbedding(CustomOp):
|
|||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
|
|
||||||
# __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
|
self.cos_sin_cache = self.cos_sin_cache.to(query.device,
|
||||||
# is expensive, so avoid calling it if possible
|
dtype=query.dtype)
|
||||||
if self.cos_sin_cache.device != query.device or \
|
|
||||||
self.cos_sin_cache.dtype != query.dtype:
|
|
||||||
self.cos_sin_cache = self.cos_sin_cache.to(query.device,
|
|
||||||
dtype=query.dtype)
|
|
||||||
|
|
||||||
# ops.rotary_embedding()/batched_rotary_embedding()
|
# ops.rotary_embedding()/batched_rotary_embedding()
|
||||||
# are in-place operations that update the query and key tensors.
|
# are in-place operations that update the query and key tensors.
|
||||||
if offsets is not None:
|
if offsets is not None:
|
||||||
|
|||||||
@@ -222,8 +222,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|||||||
Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
|
Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
scaled_quantize)
|
scaled_quantize)
|
||||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
from vllm.model_executor.layers.rotary_embedding import (
|
||||||
from vllm.platforms import current_platform
|
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
|
||||||
from vllm.utils import cdiv, round_down
|
from vllm.utils import cdiv, round_down
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -627,15 +627,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
|
|||||||
self.v_head_dim = v_head_dim
|
self.v_head_dim = v_head_dim
|
||||||
|
|
||||||
self.rotary_emb = rotary_emb
|
self.rotary_emb = rotary_emb
|
||||||
|
self.use_yarn_rope = isinstance(rotary_emb,
|
||||||
if current_platform.is_cuda():
|
DeepseekScalingRotaryEmbedding)
|
||||||
# Hack for V1 for now to avoid torch library overhead (since we are
|
|
||||||
# already inside an attention custom op), pull out the forward
|
|
||||||
# method from the rotary embedding and call it directly (and avoid
|
|
||||||
# calling forward_native, when we can call forward_cuda)
|
|
||||||
# TODO(lucas): we should probably find a cleaner way to do this
|
|
||||||
self.rotary_emb = rotary_emb.forward_cuda
|
|
||||||
|
|
||||||
self.q_proj = q_proj
|
self.q_proj = q_proj
|
||||||
self.kv_b_proj = kv_b_proj
|
self.kv_b_proj = kv_b_proj
|
||||||
self.o_proj = o_proj
|
self.o_proj = o_proj
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user