[BugFix][V1] Fix int32 token index overflow when preparing input ids (#16806)

Author: Yong Hoon Shin, 2025-04-23 12:12:35 -07:00 (committed by GitHub)
parent 3cde34a4a4
commit 32d4b669d0
2 changed files with 4 additions and 2 deletions


@@ -241,10 +241,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                            device=self.device)
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
+        # Keep in int64 to avoid overflow with long context
         self.arange_np = np.arange(max(self.max_num_reqs + 1,
                                        self.max_model_len,
                                        self.max_num_tokens),
-                                   dtype=np.int32)
+                                   dtype=np.int64)
         # NOTE(woosuk): These tensors are "stateless", i.e., they are literally
         # a faster version of creating a new tensor every time. Thus, we should
         # not make any assumptions about the values in these tensors.


@@ -219,7 +219,8 @@ class TPUModelRunner:
         # Range tensor with values [0 .. self.max_num_tokens - 1].
         # Used to initialize positions / context_lens / seq_lens
-        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int32)
+        # Keep in int64 to avoid overflow with long context
+        self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64)
         self.num_reqs_paddings = _get_req_paddings(
             min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs)
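
For context, a minimal sketch of the overflow class this change guards against (not part of the commit; the offset value is hypothetical): token indices derived from an int32 arange wrap past 2**31 - 1 during offset arithmetic, while the same arithmetic in int64 stays correct.

import numpy as np

# Hypothetical repro: with a long context, adding a large token/block
# offset to an int32 arange exceeds the int32 maximum (2**31 - 1) and
# silently wraps to negative indices; int64 handles it correctly.
offset = 2**31 - 2                                   # near the int32 ceiling
idx32 = np.arange(4, dtype=np.int32) + np.int32(offset)
print(idx32)   # [ 2147483646  2147483647 -2147483648 -2147483647]
idx64 = np.arange(4, dtype=np.int64) + offset
print(idx64)   # [2147483646 2147483647 2147483648 2147483649]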