mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 11:27:04 +08:00
fix
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
405578121c
commit
9ee9d0e274
@ -311,6 +311,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
# Cached outputs.
|
# Cached outputs.
|
||||||
self._draft_token_ids: Optional[Union[list[list[int]],
|
self._draft_token_ids: Optional[Union[list[list[int]],
|
||||||
torch.Tensor]] = None
|
torch.Tensor]] = None
|
||||||
|
self._draft_req_ids: Optional[list[str]] = None
|
||||||
self.transfer_event = torch.cuda.Event()
|
self.transfer_event = torch.cuda.Event()
|
||||||
self.sampled_token_ids_pinned_cpu = torch.empty(
|
self.sampled_token_ids_pinned_cpu = torch.empty(
|
||||||
(self.max_model_len, 1),
|
(self.max_model_len, 1),
|
||||||
@ -997,13 +998,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
# list of tuple (mm_hash, position_info)
|
# list of tuple (mm_hash, position_info)
|
||||||
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
|
mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
|
||||||
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
|
for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
|
||||||
req_state = self.requests[req_id]
|
req_data = self.requests.req_data[req_id]
|
||||||
|
|
||||||
for mm_input_id in encoder_input_ids:
|
for mm_input_id in encoder_input_ids:
|
||||||
mm_hash = req_state.mm_hashes[mm_input_id]
|
mm_hash = req_data.mm_hashes[mm_input_id]
|
||||||
mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
|
mm_kwargs.append(req_data.mm_kwargs[mm_input_id])
|
||||||
mm_hashes_pos.append(
|
mm_hashes_pos.append(
|
||||||
(mm_hash, req_state.mm_positions[mm_input_id]))
|
(mm_hash, req_data.mm_positions[mm_input_id]))
|
||||||
|
|
||||||
# Batch mm inputs as much as we can: if a request in the batch has
|
# Batch mm inputs as much as we can: if a request in the batch has
|
||||||
# multiple modalities or a different modality than the previous one,
|
# multiple modalities or a different modality than the previous one,
|
||||||
@ -1576,6 +1577,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
input_batch.spec_decode_metadata,
|
input_batch.spec_decode_metadata,
|
||||||
input_batch.spec_decode_common_attn_metadata,
|
input_batch.spec_decode_common_attn_metadata,
|
||||||
)
|
)
|
||||||
|
self._draft_req_ids = input_batch.req_ids
|
||||||
|
|
||||||
self.eplb_step()
|
self.eplb_step()
|
||||||
|
|
||||||
@ -1593,12 +1595,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
|
def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
|
||||||
if self._draft_token_ids is None:
|
if self._draft_token_ids is None:
|
||||||
return None
|
return None
|
||||||
req_ids = self.requests.req_ids
|
|
||||||
if isinstance(self._draft_token_ids, torch.Tensor):
|
if isinstance(self._draft_token_ids, torch.Tensor):
|
||||||
draft_token_ids = self._draft_token_ids.tolist()
|
draft_token_ids = self._draft_token_ids.tolist()
|
||||||
else:
|
else:
|
||||||
draft_token_ids = self._draft_token_ids
|
draft_token_ids = self._draft_token_ids
|
||||||
self._draft_token_ids = None
|
self._draft_token_ids = None
|
||||||
|
|
||||||
|
assert self._draft_req_ids
|
||||||
|
req_ids = self._draft_req_ids
|
||||||
|
self._draft_req_ids = None
|
||||||
return DraftTokenIds(req_ids, draft_token_ids)
|
return DraftTokenIds(req_ids, draft_token_ids)
|
||||||
|
|
||||||
def propose_draft_token_ids(
|
def propose_draft_token_ids(
|
||||||
@ -1614,7 +1619,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
if self.speculative_config.method == "ngram":
|
if self.speculative_config.method == "ngram":
|
||||||
assert isinstance(self.drafter, NgramProposer)
|
assert isinstance(self.drafter, NgramProposer)
|
||||||
draft_token_ids = self.propose_ngram_draft_token_ids(
|
draft_token_ids = self.propose_ngram_draft_token_ids(
|
||||||
sampled_token_ids)
|
input_batch, sampled_token_ids)
|
||||||
elif self.speculative_config.method == "medusa":
|
elif self.speculative_config.method == "medusa":
|
||||||
assert isinstance(self.drafter, MedusaProposer)
|
assert isinstance(self.drafter, MedusaProposer)
|
||||||
if sample_hidden_states.shape[0] == len(sampled_token_ids):
|
if sample_hidden_states.shape[0] == len(sampled_token_ids):
|
||||||
|
|||||||
@ -28,6 +28,7 @@ class RequestData:
|
|||||||
sampling_params: Optional[SamplingParams]
|
sampling_params: Optional[SamplingParams]
|
||||||
pooling_params: Optional[PoolingParams]
|
pooling_params: Optional[PoolingParams]
|
||||||
|
|
||||||
|
mm_hashes: list[str]
|
||||||
# M-RoPE (only for Qwen2/2.5-VL)
|
# M-RoPE (only for Qwen2/2.5-VL)
|
||||||
mrope_positions: Optional[torch.Tensor] = None
|
mrope_positions: Optional[torch.Tensor] = None
|
||||||
mrope_position_delta: Optional[int] = None
|
mrope_position_delta: Optional[int] = None
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user