faster startup of vLLM (#982)

* update

---------

Co-authored-by: Robert Irvine <robert@seamlessml.com>
This commit is contained in:
Robert Irvine 2023-09-08 06:48:54 +01:00 committed by GitHub
parent 852ef5b4f5
commit 4b5bcf8906
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -259,8 +259,9 @@ class PagedAttentionWithRoPE(PagedAttention):
self.is_neox_style = is_neox_style
# Create the cos and sin cache.
- inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
- t = torch.arange(max_position).float()
+ inv_freq = 1.0 / (base**(
+ torch.arange(0, rotary_dim, 2, device="cuda") / rotary_dim))
+ t = torch.arange(max_position, device="cuda").float()
freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
cos = freqs.cos()
sin = freqs.sin()