mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-27 20:47:12 +08:00
fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
62d23b3006
commit
af7b6c5dd4
@ -16,8 +16,8 @@ class SamplingMetadata:
|
|||||||
all_greedy: bool
|
all_greedy: bool
|
||||||
all_random: bool
|
all_random: bool
|
||||||
|
|
||||||
top_p: torch.Tensor
|
top_p: Optional[torch.Tensor]
|
||||||
top_k: torch.Tensor
|
top_k: Optional[torch.Tensor]
|
||||||
|
|
||||||
generators: dict[int, torch.Generator]
|
generators: dict[int, torch.Generator]
|
||||||
|
|
||||||
|
|||||||
@ -67,7 +67,7 @@ class BlockTables:
|
|||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
device=self.device)
|
device=self.device)
|
||||||
self.num_blocks = torch.zeros(self.num_kv_cache_groups,
|
self.num_blocks = torch.zeros(self.num_kv_cache_groups,
|
||||||
self.max_num_reqs,
|
self.max_num_cached_reqs,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
device=self.device)
|
device=self.device)
|
||||||
self.slot_mappings = torch.zeros(self.num_kv_cache_groups,
|
self.slot_mappings = torch.zeros(self.num_kv_cache_groups,
|
||||||
@ -119,8 +119,7 @@ class BlockTables:
|
|||||||
self.overwrite.np[:num_reqs] = overwrite
|
self.overwrite.np[:num_reqs] = overwrite
|
||||||
for i in range(self.num_kv_cache_groups):
|
for i in range(self.num_kv_cache_groups):
|
||||||
self.cu_num_new_blocks.np[i, :num_reqs + 1] = cu_num_new_blocks[i]
|
self.cu_num_new_blocks.np[i, :num_reqs + 1] = cu_num_new_blocks[i]
|
||||||
n = len(new_block_ids[i])
|
self.new_block_ids.np[i, :len(new_block_ids[i])] = new_block_ids[i]
|
||||||
self.new_block_ids.np[i, :n] = new_block_ids[i]
|
|
||||||
|
|
||||||
_append_block_ids_kernel[(num_reqs, self.num_kv_cache_groups)](
|
_append_block_ids_kernel[(num_reqs, self.num_kv_cache_groups)](
|
||||||
self.req_indices.copy_to_gpu(num_reqs),
|
self.req_indices.copy_to_gpu(num_reqs),
|
||||||
@ -154,17 +153,17 @@ class BlockTables:
|
|||||||
|
|
||||||
def compute_slot_mappings(
|
def compute_slot_mappings(
|
||||||
self,
|
self,
|
||||||
cu_num_tokens: torch.Tensor,
|
query_start_loc: torch.Tensor,
|
||||||
pos: torch.Tensor,
|
positions: torch.Tensor,
|
||||||
num_tokens: int,
|
|
||||||
) -> tuple[torch.Tensor, ...]:
|
) -> tuple[torch.Tensor, ...]:
|
||||||
num_reqs = cu_num_tokens.shape[0] - 1
|
num_reqs = query_start_loc.shape[0] - 1
|
||||||
|
num_tokens = positions.shape[0]
|
||||||
num_groups = self.num_kv_cache_groups
|
num_groups = self.num_kv_cache_groups
|
||||||
_compute_slot_mappings_kernel[(num_reqs + 1, num_groups)](
|
_compute_slot_mappings_kernel[(num_reqs + 1, num_groups)](
|
||||||
num_tokens,
|
num_tokens,
|
||||||
self.max_num_batched_tokens,
|
self.max_num_batched_tokens,
|
||||||
cu_num_tokens,
|
query_start_loc,
|
||||||
pos,
|
positions,
|
||||||
self.block_table_ptrs,
|
self.block_table_ptrs,
|
||||||
self.block_table_strides,
|
self.block_table_strides,
|
||||||
self.block_sizes_tensor,
|
self.block_sizes_tensor,
|
||||||
@ -188,7 +187,7 @@ def _append_block_ids_kernel(
|
|||||||
block_table_strides, # [num_kv_cache_groups]
|
block_table_strides, # [num_kv_cache_groups]
|
||||||
# Outputs
|
# Outputs
|
||||||
block_table_buffer_ptrs, # [num_kv_cache_groups]
|
block_table_buffer_ptrs, # [num_kv_cache_groups]
|
||||||
num_blocks_ptr, # [num_kv_cache_groups, max_num_reqs]
|
num_blocks_ptr, # [num_kv_cache_groups, max_num_cached_reqs]
|
||||||
num_blocks_stride,
|
num_blocks_stride,
|
||||||
# Constants
|
# Constants
|
||||||
BLOCK_SIZE: tl.constexpr,
|
BLOCK_SIZE: tl.constexpr,
|
||||||
@ -235,7 +234,7 @@ def _compute_block_tables_kernel(
|
|||||||
src_block_table_ptrs, # [num_kv_cache_groups]
|
src_block_table_ptrs, # [num_kv_cache_groups]
|
||||||
dst_block_table_ptrs, # [num_kv_cache_groups]
|
dst_block_table_ptrs, # [num_kv_cache_groups]
|
||||||
block_table_strides, # [num_kv_cache_groups]
|
block_table_strides, # [num_kv_cache_groups]
|
||||||
num_blocks_ptr, # [num_kv_cache_groups, max_num_reqs]
|
num_blocks_ptr, # [num_kv_cache_groups, max_num_cached_reqs]
|
||||||
num_blocks_stride,
|
num_blocks_stride,
|
||||||
BLOCK_SIZE: tl.constexpr,
|
BLOCK_SIZE: tl.constexpr,
|
||||||
):
|
):
|
||||||
|
|||||||
@ -262,15 +262,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
# None in the first PP rank. The rest are set after load_model.
|
# None in the first PP rank. The rest are set after load_model.
|
||||||
self.intermediate_tensors: Optional[IntermediateTensors] = None
|
self.intermediate_tensors: Optional[IntermediateTensors] = None
|
||||||
|
|
||||||
self.block_tables = BlockTables(
|
|
||||||
block_sizes=[self.cache_config.block_size],
|
|
||||||
max_num_reqs=self.max_num_reqs,
|
|
||||||
max_num_cached_reqs=2 * self.max_num_reqs,
|
|
||||||
max_num_batched_tokens=self.max_num_tokens,
|
|
||||||
max_model_len=self.max_model_len,
|
|
||||||
device=self.device,
|
|
||||||
pin_memory=self.pin_memory,
|
|
||||||
)
|
|
||||||
self.idx_mapping = self._make_buffer(self.max_num_reqs,
|
self.idx_mapping = self._make_buffer(self.max_num_reqs,
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
|
|
||||||
@ -581,7 +572,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
|
|
||||||
# Compute the slot mappings on GPUs.
|
# Compute the slot mappings on GPUs.
|
||||||
slot_mappings = self.block_tables.compute_slot_mappings(
|
slot_mappings = self.block_tables.compute_slot_mappings(
|
||||||
query_start_loc, self.positions.gpu, total_num_scheduled_tokens)
|
query_start_loc, self.positions.gpu[:total_num_scheduled_tokens])
|
||||||
|
|
||||||
if self.uses_mrope:
|
if self.uses_mrope:
|
||||||
self._calc_mrope_positions(req_ids, num_scheduled_tokens)
|
self._calc_mrope_positions(req_ids, num_scheduled_tokens)
|
||||||
@ -635,8 +626,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
# Used in the below loop.
|
# Used in the below loop.
|
||||||
query_start_loc_cpu = self.query_start_loc.cpu[:num_reqs + 1]
|
query_start_loc_cpu = self.query_start_loc.cpu[:num_reqs + 1]
|
||||||
seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
|
seq_lens_cpu = self.seq_lens.cpu[:num_reqs]
|
||||||
num_computed_tokens_cpu = (
|
num_computed_tokens_np = self.requests.num_computed_tokens.np[
|
||||||
self.requests.num_computed_tokens.cpu[:num_reqs])
|
idx_mapping_np]
|
||||||
|
num_computed_tokens_cpu = torch.from_numpy(num_computed_tokens_np)
|
||||||
spec_decode_common_attn_metadata = None
|
spec_decode_common_attn_metadata = None
|
||||||
|
|
||||||
attn_metadata: dict[str, Any] = {}
|
attn_metadata: dict[str, Any] = {}
|
||||||
@ -1444,16 +1436,29 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
)
|
)
|
||||||
sampler_output.sampled_token_ids = output_token_ids
|
sampler_output.sampled_token_ids = output_token_ids
|
||||||
|
|
||||||
for i in range(input_batch.num_reqs):
|
|
||||||
req_idx = input_batch.idx_mapping_np[i]
|
|
||||||
num_tokens = input_batch.num_scheduled_tokens[i]
|
|
||||||
self.requests.num_computed_tokens.np[req_idx] += num_tokens
|
|
||||||
|
|
||||||
num_nans_in_logits = {}
|
num_nans_in_logits = {}
|
||||||
if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
|
if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
|
||||||
num_nans_in_logits = self._get_nans_in_logits(
|
num_nans_in_logits = self._get_nans_in_logits(
|
||||||
input_batch.req_ids, logits)
|
input_batch.req_ids, logits)
|
||||||
|
|
||||||
|
# TODO(woosuk): The following loop can be slow since it iterates over
|
||||||
|
# the requests one by one. Optimize.
|
||||||
|
discard_sampled_tokens_req_indices: list[int] = []
|
||||||
|
for i, req_id in enumerate(input_batch.req_ids):
|
||||||
|
req_idx = self.requests.req_id_to_index[req_id]
|
||||||
|
seq_len = (self.requests.num_computed_tokens.np[req_idx] +
|
||||||
|
input_batch.num_scheduled_tokens[i])
|
||||||
|
if seq_len < self.requests.num_tokens.np[req_idx]:
|
||||||
|
# Ignore the sampled token for partial prefills.
|
||||||
|
# Rewind the generator state as if the token was not sampled.
|
||||||
|
# This relies on cuda-specific torch-internal impl details
|
||||||
|
generator = self.requests.generators.get(req_idx)
|
||||||
|
if generator is not None:
|
||||||
|
generator.set_offset(generator.get_offset() - 4)
|
||||||
|
# Record the index of the request that should not be sampled,
|
||||||
|
# so that we could clear the sampled tokens before returning.
|
||||||
|
discard_sampled_tokens_req_indices.append(i)
|
||||||
|
|
||||||
# NOTE: GPU -> CPU Sync happens here.
|
# NOTE: GPU -> CPU Sync happens here.
|
||||||
# Move as many CPU operations as possible before this sync point.
|
# Move as many CPU operations as possible before this sync point.
|
||||||
logprobs_tensors = sampler_output.logprobs_tensors
|
logprobs_tensors = sampler_output.logprobs_tensors
|
||||||
@ -1471,23 +1476,36 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
max_gen_len = sampled_token_ids.shape[-1]
|
max_gen_len = sampled_token_ids.shape[-1]
|
||||||
if max_gen_len == 1:
|
if max_gen_len == 1:
|
||||||
# No spec decode tokens.
|
# No spec decode tokens.
|
||||||
valid_sampled_token_ids_np = sampled_token_ids.cpu().numpy()
|
valid_sampled_token_ids = self._to_list(sampled_token_ids)
|
||||||
valid_sampled_token_ids = valid_sampled_token_ids_np.tolist()
|
|
||||||
# valid_sampled_token_ids = self._to_list(sampled_token_ids)
|
|
||||||
else:
|
else:
|
||||||
# Includes spec decode tokens.
|
# Includes spec decode tokens.
|
||||||
valid_sampled_token_ids = self.rejection_sampler.parse_output(
|
valid_sampled_token_ids = self.rejection_sampler.parse_output(
|
||||||
sampled_token_ids, self.vocab_size)
|
sampled_token_ids, self.vocab_size)
|
||||||
|
# Mask out the sampled tokens that should not be sampled.
|
||||||
|
for i in discard_sampled_tokens_req_indices:
|
||||||
|
valid_sampled_token_ids[i].clear()
|
||||||
|
|
||||||
# Cache the sampled tokens in the model runner, so that the scheduler
|
# Cache the sampled tokens in the model runner, so that the scheduler
|
||||||
# doesn't need to send them back.
|
# doesn't need to send them back.
|
||||||
# NOTE(woosuk): As an exception, when using PP, the scheduler sends
|
# NOTE(woosuk): As an exception, when using PP, the scheduler sends
|
||||||
# the sampled tokens back, because there's no direct communication
|
# the sampled tokens back, because there's no direct communication
|
||||||
# between the first-stage worker and the last-stage worker.
|
# between the first-stage worker and the last-stage worker.
|
||||||
self.requests.append_sampled_token_ids(
|
for i, req_id in enumerate(input_batch.req_ids):
|
||||||
input_batch.idx_mapping_np,
|
sampled_ids = valid_sampled_token_ids[i]
|
||||||
valid_sampled_token_ids,
|
if not sampled_ids:
|
||||||
)
|
continue
|
||||||
|
req_idx = self.requests.req_id_to_index[req_id]
|
||||||
|
|
||||||
|
start_idx = self.requests.num_tokens.np[req_idx]
|
||||||
|
end_idx = start_idx + len(sampled_ids)
|
||||||
|
assert end_idx <= self.max_model_len, (
|
||||||
|
"Sampled token IDs exceed the max model length. "
|
||||||
|
f"Total number of tokens: {end_idx} > max_model_len: "
|
||||||
|
f"{self.max_model_len}")
|
||||||
|
|
||||||
|
self.requests.token_ids.np[req_idx,
|
||||||
|
start_idx:end_idx] = sampled_ids
|
||||||
|
self.requests.num_tokens.np[req_idx] = end_idx
|
||||||
|
|
||||||
if self.speculative_config:
|
if self.speculative_config:
|
||||||
assert input_batch.spec_decode_common_attn_metadata is not None
|
assert input_batch.spec_decode_common_attn_metadata is not None
|
||||||
@ -1648,14 +1666,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
draft_token_ids.append([])
|
draft_token_ids.append([])
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip requests that require sampling parameters that are not
|
# # Skip requests that require sampling parameters that are not
|
||||||
# supported with speculative decoding.
|
# # supported with speculative decoding.
|
||||||
req_id = input_batch.req_ids[i]
|
# req_id = input_batch.req_ids[i]
|
||||||
if req_id in self.requests.spec_decode_unsupported_reqs:
|
# if req_id in self.requests.spec_decode_unsupported_reqs:
|
||||||
draft_token_ids.append([])
|
# draft_token_ids.append([])
|
||||||
continue
|
# continue
|
||||||
|
|
||||||
num_tokens = self.requests.num_tokens_no_spec[i]
|
num_tokens = self.requests.num_tokens.np[i]
|
||||||
if num_tokens >= self.max_model_len:
|
if num_tokens >= self.max_model_len:
|
||||||
# Skip requests that have already reached the max model length.
|
# Skip requests that have already reached the max model length.
|
||||||
draft_token_ids.append([])
|
draft_token_ids.append([])
|
||||||
@ -2824,6 +2842,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
def init_block_tables(self, kv_cache_config: KVCacheConfig) -> None:
|
||||||
|
block_sizes = [
|
||||||
|
kv_cache_group.kv_cache_spec.block_size
|
||||||
|
for kv_cache_group in kv_cache_config.kv_cache_groups
|
||||||
|
]
|
||||||
|
self.block_tables = BlockTables(
|
||||||
|
block_sizes=block_sizes,
|
||||||
|
max_num_reqs=self.max_num_reqs,
|
||||||
|
max_num_cached_reqs=2 * self.max_num_reqs,
|
||||||
|
max_num_batched_tokens=self.max_num_tokens,
|
||||||
|
max_model_len=self.max_model_len,
|
||||||
|
device=self.device,
|
||||||
|
pin_memory=self.pin_memory,
|
||||||
|
)
|
||||||
|
|
||||||
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
|
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
|
||||||
"""
|
"""
|
||||||
Initialize KV cache based on `kv_cache_config`.
|
Initialize KV cache based on `kv_cache_config`.
|
||||||
@ -2833,6 +2866,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
"""
|
"""
|
||||||
kv_cache_config = deepcopy(kv_cache_config)
|
kv_cache_config = deepcopy(kv_cache_config)
|
||||||
self.kv_cache_config = kv_cache_config
|
self.kv_cache_config = kv_cache_config
|
||||||
|
self.init_block_tables(kv_cache_config)
|
||||||
self.may_add_encoder_only_layers_to_kv_cache_config()
|
self.may_add_encoder_only_layers_to_kv_cache_config()
|
||||||
self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
|
self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
|
||||||
self.initialize_attn_backend(kv_cache_config)
|
self.initialize_attn_backend(kv_cache_config)
|
||||||
|
|||||||
@ -227,16 +227,6 @@ class RequestState:
|
|||||||
self.token_ids.np[req_idx, start_idx:end_idx] = token_ids
|
self.token_ids.np[req_idx, start_idx:end_idx] = token_ids
|
||||||
self.num_tokens.np[req_idx] = end_idx
|
self.num_tokens.np[req_idx] = end_idx
|
||||||
|
|
||||||
def append_sampled_token_ids(
|
|
||||||
self,
|
|
||||||
idx_mapping: np.ndarray,
|
|
||||||
sampled_token_ids: np.ndarray,
|
|
||||||
) -> None:
|
|
||||||
num_reqs = idx_mapping.shape[0]
|
|
||||||
for i in range(num_reqs):
|
|
||||||
req_idx = idx_mapping[i]
|
|
||||||
self.append_token_ids(req_idx, sampled_token_ids[i])
|
|
||||||
|
|
||||||
def remove_request(self, req_id: str) -> None:
|
def remove_request(self, req_id: str) -> None:
|
||||||
req_idx = self.req_id_to_index.pop(req_id, None)
|
req_idx = self.req_id_to_index.pop(req_id, None)
|
||||||
if req_idx is None:
|
if req_idx is None:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user