mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-05 05:24:37 +08:00
[Perf][Async Scheduling] Remove CPU->GPU sync in dummy_run (#27455)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
This commit is contained in:
parent
17af6aa0da
commit
0402428200
@@ -3492,7 +3492,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
self.eplb_step(is_dummy=True, is_profile=is_profile)
|
self.eplb_step(is_dummy=True, is_profile=is_profile)
|
||||||
|
|
||||||
logit_indices = np.cumsum(num_scheduled_tokens) - 1
|
logit_indices = np.cumsum(num_scheduled_tokens) - 1
|
||||||
return hidden_states, hidden_states[logit_indices]
|
logit_indices_device = torch.from_numpy(logit_indices).to(
|
||||||
|
self.device, non_blocking=True
|
||||||
|
)
|
||||||
|
return hidden_states, hidden_states[logit_indices_device]
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def _dummy_sampler_run(
|
def _dummy_sampler_run(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user