mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-10 09:26:55 +08:00
[Bugfix] DeepSeek Accuracy (#14476)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
206e2577fa
commit
db84f5eb3b
@ -222,8 +222,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
|||||||
Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
|
Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
|
||||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
scaled_quantize)
|
scaled_quantize)
|
||||||
from vllm.model_executor.layers.rotary_embedding import (
|
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
||||||
DeepseekScalingRotaryEmbedding, RotaryEmbedding)
|
|
||||||
from vllm.utils import cdiv, round_down
|
from vllm.utils import cdiv, round_down
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -626,9 +625,12 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
|
|||||||
self.qk_head_dim = qk_head_dim
|
self.qk_head_dim = qk_head_dim
|
||||||
self.v_head_dim = v_head_dim
|
self.v_head_dim = v_head_dim
|
||||||
|
|
||||||
self.rotary_emb = rotary_emb
|
# Hack for V1 for now to avoid torch library overhead (since we are
|
||||||
self.use_yarn_rope = isinstance(rotary_emb,
|
# already inside an attention custom op), pull out the forward
|
||||||
DeepseekScalingRotaryEmbedding)
|
# method from the rotary embedding and call it directly
|
||||||
|
# TODO(lucas): we should probably find a cleaner way to do this
|
||||||
|
self.rotary_emb = rotary_emb._forward_method
|
||||||
|
|
||||||
self.q_proj = q_proj
|
self.q_proj = q_proj
|
||||||
self.kv_b_proj = kv_b_proj
|
self.kv_b_proj = kv_b_proj
|
||||||
self.o_proj = o_proj
|
self.o_proj = o_proj
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user