mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 18:17:05 +08:00
minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
b21393cd98
commit
efba25e21a
@ -30,6 +30,8 @@ class SamplingMetadata:
|
|||||||
repetition_penalties: torch.Tensor
|
repetition_penalties: torch.Tensor
|
||||||
|
|
||||||
token_ids: Optional[torch.Tensor]
|
token_ids: Optional[torch.Tensor]
|
||||||
|
num_tokens: Optional[torch.Tensor]
|
||||||
|
num_prompt_tokens: Optional[torch.Tensor]
|
||||||
|
|
||||||
# `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size,
|
# `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size,
|
||||||
# vocab size).
|
# vocab size).
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import triton
|
|||||||
import triton.language as tl
|
import triton.language as tl
|
||||||
|
|
||||||
from vllm.utils import cdiv
|
from vllm.utils import cdiv
|
||||||
from vllm.v1.worker.utils import CpuGpuBuffer
|
from vllm.v1.utils import CpuGpuBuffer
|
||||||
|
|
||||||
PAD_SLOT_ID = -1
|
PAD_SLOT_ID = -1
|
||||||
|
|
||||||
|
|||||||
@ -446,12 +446,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
new_token_ids = cached_reqs.new_token_ids[i]
|
new_token_ids = cached_reqs.new_token_ids[i]
|
||||||
self.requests.append_token_ids(req_index, new_token_ids)
|
self.requests.append_token_ids(req_index, new_token_ids)
|
||||||
|
|
||||||
if cached_reqs.new_block_ids[i] is not None:
|
req_new_block_ids = cached_reqs.new_block_ids[i]
|
||||||
|
if req_new_block_ids is not None:
|
||||||
req_indices.append(req_index)
|
req_indices.append(req_index)
|
||||||
for i, block_ids in enumerate(cached_reqs.new_block_ids[i]):
|
for group_id, block_ids in enumerate(req_new_block_ids):
|
||||||
x = cu_num_new_blocks[i][-1]
|
x = cu_num_new_blocks[group_id][-1]
|
||||||
cu_num_new_blocks[i].append(x + len(block_ids))
|
cu_num_new_blocks[group_id].append(x + len(block_ids))
|
||||||
new_block_ids[i].extend(block_ids)
|
new_block_ids[group_id].extend(block_ids)
|
||||||
# If the request is resumed from preemption, we need to
|
# If the request is resumed from preemption, we need to
|
||||||
# overwrite the existing block IDs.
|
# overwrite the existing block IDs.
|
||||||
overwrite.append(cached_reqs.resumed_from_preemption[i])
|
overwrite.append(cached_reqs.resumed_from_preemption[i])
|
||||||
@ -1686,7 +1687,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
common_attn_metadata, token_indices =\
|
common_attn_metadata, token_indices =\
|
||||||
self.drafter.prepare_inputs(
|
self.drafter.prepare_inputs(
|
||||||
common_attn_metadata, num_rejected_tokens_cpu)
|
input_batch.spec_decode_common_attn_metadata,
|
||||||
|
num_rejected_tokens_cpu)
|
||||||
|
|
||||||
target_token_ids = self.input_ids.gpu[token_indices]
|
target_token_ids = self.input_ids.gpu[token_indices]
|
||||||
# TODO(woosuk): Support M-RoPE.
|
# TODO(woosuk): Support M-RoPE.
|
||||||
@ -2142,10 +2144,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
num_actual_tokens=num_tokens,
|
num_actual_tokens=num_tokens,
|
||||||
max_query_len=max_query_len,
|
max_query_len=max_query_len,
|
||||||
max_seq_len=self.max_model_len,
|
max_seq_len=self.max_model_len,
|
||||||
block_table_tensor=self.requests.
|
block_table_tensor=self.block_tables.
|
||||||
block_tables[kv_cache_group_id].gpu[:num_reqs],
|
block_tables[kv_cache_group_id][:num_reqs],
|
||||||
slot_mapping=self.requests.slot_mappings[kv_cache_group_id]
|
slot_mapping=self.block_tables.
|
||||||
[:num_tokens],
|
slot_mappings[kv_cache_group_id][:num_tokens],
|
||||||
causal=True,
|
causal=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -2607,9 +2609,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
self.attn_groups.append(
|
self.attn_groups.append(
|
||||||
create_attn_groups(attn_backends, kv_cache_spec))
|
create_attn_groups(attn_backends, kv_cache_spec))
|
||||||
|
|
||||||
# Calculate reorder batch threshold (if neeeded)
|
|
||||||
self.calculate_reorder_batch_threshold()
|
|
||||||
|
|
||||||
def initialize_cudagraph_capture(self) -> None:
|
def initialize_cudagraph_capture(self) -> None:
|
||||||
min_cg_support = AttentionCGSupport.ALWAYS
|
min_cg_support = AttentionCGSupport.ALWAYS
|
||||||
min_cg_builder_name = None
|
min_cg_builder_name = None
|
||||||
@ -2679,62 +2678,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
self.compilation_config.cudagraph_mode,
|
self.compilation_config.cudagraph_mode,
|
||||||
self.uniform_decode_query_len)
|
self.uniform_decode_query_len)
|
||||||
|
|
||||||
def calculate_reorder_batch_threshold(self) -> None:
|
|
||||||
"""
|
|
||||||
Check that if any backends reorder batches; that the reordering
|
|
||||||
is compatible (e.g., decode threshold is the same)
|
|
||||||
"""
|
|
||||||
for group in self._attn_group_iterator():
|
|
||||||
attn_metadata_builder_i = group.metadata_builder
|
|
||||||
|
|
||||||
# check that if any backends reorder batches; that the reordering
|
|
||||||
# is compatible (e.g., decode threshold is the same)
|
|
||||||
reorder_batch_threshold_i = (
|
|
||||||
attn_metadata_builder_i.reorder_batch_threshold)
|
|
||||||
if reorder_batch_threshold_i is not None:
|
|
||||||
if self.reorder_batch_threshold is not None:
|
|
||||||
if reorder_batch_threshold_i != \
|
|
||||||
self.reorder_batch_threshold:
|
|
||||||
raise ValueError(
|
|
||||||
f"Attention backend reorders decodes with "
|
|
||||||
f"threshold {reorder_batch_threshold_i} but other "
|
|
||||||
f"backend uses threshold "
|
|
||||||
f"{self.reorder_batch_threshold}")
|
|
||||||
else:
|
|
||||||
self.reorder_batch_threshold = reorder_batch_threshold_i
|
|
||||||
|
|
||||||
def may_reinitialize_input_batch(self,
|
|
||||||
kv_cache_config: KVCacheConfig) -> None:
|
|
||||||
"""
|
|
||||||
Re-initialize the input batch if the block sizes are different from
|
|
||||||
`[self.cache_config.block_size]`. This usually happens when there
|
|
||||||
are multiple KV cache groups.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
kv_cache_config: The KV cache configuration.
|
|
||||||
"""
|
|
||||||
block_sizes = [
|
|
||||||
kv_cache_group.kv_cache_spec.block_size
|
|
||||||
for kv_cache_group in kv_cache_config.kv_cache_groups
|
|
||||||
]
|
|
||||||
if block_sizes != [self.cache_config.block_size]:
|
|
||||||
assert self.cache_config.cpu_offload_gb == 0, (
|
|
||||||
"Cannot re-initialize the input batch when CPU weight "
|
|
||||||
"offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501
|
|
||||||
"for more details.")
|
|
||||||
self.input_batch = InputBatch(
|
|
||||||
max_num_reqs=self.max_num_reqs,
|
|
||||||
max_model_len=self.max_model_len,
|
|
||||||
max_num_batched_tokens=self.max_num_tokens,
|
|
||||||
device=self.device,
|
|
||||||
pin_memory=self.pin_memory,
|
|
||||||
vocab_size=self.model_config.get_vocab_size(),
|
|
||||||
block_sizes=block_sizes,
|
|
||||||
is_spec_decode=bool(self.vllm_config.speculative_config),
|
|
||||||
logitsprocs=self.input_batch.logitsprocs,
|
|
||||||
is_pooling_model=self.is_pooling_model,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _allocate_kv_cache_tensors(
|
def _allocate_kv_cache_tensors(
|
||||||
self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
|
self, kv_cache_config: KVCacheConfig) -> dict[str, torch.Tensor]:
|
||||||
"""
|
"""
|
||||||
@ -2941,7 +2884,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
"""
|
"""
|
||||||
kv_cache_config = deepcopy(kv_cache_config)
|
kv_cache_config = deepcopy(kv_cache_config)
|
||||||
self.kv_cache_config = kv_cache_config
|
self.kv_cache_config = kv_cache_config
|
||||||
self.may_reinitialize_input_batch(kv_cache_config)
|
|
||||||
self.may_add_encoder_only_layers_to_kv_cache_config()
|
self.may_add_encoder_only_layers_to_kv_cache_config()
|
||||||
self.initialize_attn_backend(kv_cache_config)
|
self.initialize_attn_backend(kv_cache_config)
|
||||||
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
|
kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
|
||||||
|
|||||||
@ -282,6 +282,8 @@ class RequestState:
|
|||||||
# TODO
|
# TODO
|
||||||
generators={},
|
generators={},
|
||||||
token_ids=None,
|
token_ids=None,
|
||||||
|
num_tokens=None,
|
||||||
|
num_prompt_tokens=None,
|
||||||
max_num_logprobs=None,
|
max_num_logprobs=None,
|
||||||
allowed_token_ids_mask=None,
|
allowed_token_ids_mask=None,
|
||||||
bad_words_token_ids={},
|
bad_words_token_ids={},
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user