Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[BugFix][V1] Fix int32 token index overflow when preparing input ids (#16806)
parent 3cde34a4a4
commit 32d4b669d0
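The failure mode, sketched below: when preparing input ids, flattened token indices are derived from this cached arange, roughly as req_index * max_model_len + position (the exact expression in vLLM may differ; the names and sizes here are illustrative). With a long context window the product exceeds 2**31 - 1, and int32 array arithmetic in NumPy silently wraps to negative values, yielding garbage indices into the token-id buffer.

import numpy as np

# Minimal sketch of the overflow this commit fixes; the index formula
# and sizes are illustrative, not the exact vLLM code.
max_model_len = 2**20                            # hypothetical 1M-token context
req_indices = np.arange(4096, dtype=np.int32)    # int32, as before the fix

# 2048 * 2**20 == 2**31 already exceeds int32's max of 2**31 - 1,
# so the multiplication silently wraps around.
flat32 = req_indices * max_model_len
print(flat32.min())                              # -2147483648 after wraparound

# With int64 (the fix), the same arithmetic stays in range.
flat64 = req_indices.astype(np.int64) * max_model_len
print(flat64.max())                              # 4293918720 == 4095 * 2**20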
vllm/v1/worker/gpu_model_runner.py
@@ -241,10 +241,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                  device=self.device)
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
+        # Keep in int64 to avoid overflow with long context
         self.arange_np = np.arange(max(self.max_num_reqs + 1,
                                        self.max_model_len,
                                        self.max_num_tokens),
-                                   dtype=np.int32)
+                                   dtype=np.int64)
         # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
         # a faster version of creating a new tensor every time. Thus, we should
         # not make any assumptions about the values in these tensors.
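Note that int64 doubles the size of this cached host-side array, but it is allocated once at startup and stays in the low megabytes even for million-token contexts, so the cost is negligible compared to silent index wraparound.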
vllm/v1/worker/tpu_model_runner.py
@@ -219,7 +219,8 @@ class TPUModelRunner:
 
         # Range tensor with values [0 .. self.max_num_tokens - 1].
         # Used to initialize positions / context_lens / seq_lens
-        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int32)
+        # Keep in int64 to avoid overflow with long context
+        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64)
         self.num_reqs_paddings = _get_req_paddings(
             min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)
 
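Changing only the dtype of the cached arange is enough because NumPy propagates it: slices of an int64 array are int64 views, and arithmetic on them stays int64. A hedged sketch of the per-step reuse, with illustrative names rather than the actual runner code:

import numpy as np

# Cached once at init, as in the runners above (names illustrative).
arange_np = np.arange(2**20, dtype=np.int64)

num_tokens = 1000
positions = arange_np[:num_tokens]      # slicing returns an int64 view
req_offset = 3000 * 2**20               # an offset that would overflow int32
token_indices = positions + req_offset  # stays int64, no wraparound

assert token_indices.dtype == np.int64
print(token_indices[-1])                # 3145728999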