From 44ead56ad5c2fd90dad1b971889ae3493e8039a8 Mon Sep 17 00:00:00 2001
From: yewentao256
Date: Mon, 11 Aug 2025 14:07:29 +0000
Subject: [PATCH] fix set forward context error

Signed-off-by: yewentao256
---
 vllm/v1/worker/gpu_model_runner.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0eb29bebdd2f9..d56f56e1fce32 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2031,7 +2031,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
-        self.maybe_setup_kv_connector(scheduler_output)
+        with set_forward_context(attn_metadata,
+                                 vllm_config=self.vllm_config,
+                                 num_tokens=num_input_tokens or 1,
+                                 num_tokens_across_dp=num_tokens_after_padding,
+                                 skip_cuda_graphs=skip_cuda_graphs):
+            self.maybe_setup_kv_connector(scheduler_output)
         model_output = self._run_model(
             attn_metadata=attn_metadata,
             num_scheduled_tokens=num_input_tokens,