From e67b4f2c2a216ff12d4f607caa3ba3409ae3f572 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 11 Sep 2023 00:26:35 -0700
Subject: [PATCH] Use FP32 in RoPE initialization (#1004)

Co-authored-by: One
---
 tests/kernels/test_pos_encoding.py      | 5 +++--
 vllm/model_executor/layers/attention.py | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index 1e591295e1cb3..0d255900d4c11 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -133,9 +133,10 @@ def test_rotary_embedding(
                       device="cuda")
 
     # Create the rotary embedding.
-    inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
+    inv_freq = 1.0 / (base**(
+        torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
     t = torch.arange(max_position).float()
-    freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
+    freqs = torch.einsum("i,j -> ij", t, inv_freq)
     cos = freqs.cos()
     sin = freqs.sin()
     cos_sin_cache = torch.cat((cos, sin), dim=-1)
diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index c35cd8a6e900e..5e9360a3c20ed 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -264,10 +264,10 @@ class PagedAttentionWithRoPE(PagedAttention):
         self.is_neox_style = is_neox_style
 
         # Create the cos and sin cache.
-        inv_freq = 1.0 / (base**(
-            torch.arange(0, rotary_dim, 2, device="cuda") / rotary_dim))
-        t = torch.arange(max_position, device="cuda").float()
-        freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
+        inv_freq = 1.0 / (base**(torch.arange(
+            0, rotary_dim, 2, dtype=torch.float, device="cuda") / rotary_dim))
+        t = torch.arange(max_position, dtype=torch.float, device="cuda")
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
         cos = freqs.cos()
         sin = freqs.sin()
         cache = torch.cat((cos, sin), dim=-1)
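
Note on the change: the patch pins the RoPE frequency math to FP32 by giving torch.arange an explicit dtype. The sketch below is a minimal standalone illustration (not vLLM code) of why this matters: when a model is constructed in half precision, the module is typically built while torch's default floating dtype is torch.half, so dividing the integer arange by rotary_dim promotes to FP16 and the base**(...) term is computed with FP16 rounding; the explicit dtype keeps that math in FP32. The values rotary_dim=128 and base=10000 are illustrative assumptions, not taken from the patch.

```python
import torch

rotary_dim, base = 128, 10000

# Assumed context: the cache is built while the default dtype is the model
# dtype (half precision here).
torch.set_default_dtype(torch.half)

# Pre-patch behavior: torch.arange with integer args returns int64, and
# integer-tensor / int promotes to the *default* floating dtype, so the
# whole inv_freq computation happens in FP16.
lossy = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))

# Patched behavior: the explicit dtype keeps the intermediate math in FP32
# regardless of the default dtype.
exact = 1.0 / (base**(
    torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))

torch.set_default_dtype(torch.float)

print(lossy.dtype, exact.dtype)             # torch.float16 torch.float32
print((exact - lossy.float()).abs().max())  # small but nonzero FP16 rounding error
```

The same reasoning applies to the einsum over positions: once inv_freq is already FP32, the later `.float()` upcasts in the old code become redundant, which is why the patch also drops them.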