diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a08d2262f0f30..efa88d5c68f6b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3492,7 +3492,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.eplb_step(is_dummy=True, is_profile=is_profile) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - return hidden_states, hidden_states[logit_indices] + logit_indices_device = torch.from_numpy(logit_indices).to( + self.device, non_blocking=True + ) + return hidden_states, hidden_states[logit_indices_device] @torch.inference_mode() def _dummy_sampler_run(