mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-17 07:09:08 +08:00
[V1][Minor] Minor cleanup for GPU Model Runner (#13983)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
38acae6e97
commit
cd813c6d4d
@@ -1187,8 +1187,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# NOTE: Currently model is profiled with a single non-text
|
# NOTE: Currently model is profiled with a single non-text
|
||||||
# modality with the max possible input tokens even when
|
# modality with the max possible input tokens even when
|
||||||
# it supports multiple.
|
# it supports multiple.
|
||||||
max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality( # noqa: E501
|
max_tokens_by_modality_dict = (
|
||||||
self.model_config)
|
MULTIMODAL_REGISTRY.
|
||||||
|
get_max_tokens_per_item_by_nonzero_modality(self.model_config))
|
||||||
dummy_data_modality, max_tokens_per_mm_item = max(
|
dummy_data_modality, max_tokens_per_mm_item = max(
|
||||||
max_tokens_by_modality_dict.items(), key=lambda item: item[1])
|
max_tokens_by_modality_dict.items(), key=lambda item: item[1])
|
||||||
|
|
||||||
@@ -1275,15 +1276,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# maximum num_tokens.
|
# maximum num_tokens.
|
||||||
num_reqs = self.scheduler_config.max_num_seqs
|
num_reqs = self.scheduler_config.max_num_seqs
|
||||||
num_tokens = self.max_num_tokens
|
num_tokens = self.max_num_tokens
|
||||||
min_tokens_per_req: int = num_tokens // num_reqs
|
min_tokens_per_req = num_tokens // num_reqs
|
||||||
|
|
||||||
num_scheduled_tokens_list: List[int] = [min_tokens_per_req] * num_reqs
|
num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
|
||||||
num_scheduled_tokens_list[-1] += num_tokens % num_reqs
|
num_scheduled_tokens_list[-1] += num_tokens % num_reqs
|
||||||
assert sum(num_scheduled_tokens_list) == num_tokens
|
assert sum(num_scheduled_tokens_list) == num_tokens
|
||||||
assert len(num_scheduled_tokens_list) == num_reqs
|
assert len(num_scheduled_tokens_list) == num_reqs
|
||||||
|
|
||||||
num_scheduled_tokens: np.ndarray = np.array(num_scheduled_tokens_list,
|
num_scheduled_tokens = np.array(num_scheduled_tokens_list,
|
||||||
dtype=np.int32)
|
dtype=np.int32)
|
||||||
logit_indices = np.cumsum(num_scheduled_tokens) - 1
|
logit_indices = np.cumsum(num_scheduled_tokens) - 1
|
||||||
|
|
||||||
with self.maybe_profile_with_lora(self.lora_config,
|
with self.maybe_profile_with_lora(self.lora_config,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user