Fix `set_forward_context` usage error

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
yewentao256 2025-08-11 14:07:29 +00:00
parent 28e7c30b01
commit 44ead56ad5

View File

@ -2031,7 +2031,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Run the model.
# Use persistent buffers for CUDA graphs.
self.maybe_setup_kv_connector(scheduler_output)
with set_forward_context(attn_metadata,
vllm_config=self.vllm_config,
num_tokens=num_input_tokens or 1,
num_tokens_across_dp=num_tokens_after_padding,
skip_cuda_graphs=skip_cuda_graphs):
self.maybe_setup_kv_connector(scheduler_output)
model_output = self._run_model(
attn_metadata=attn_metadata,
num_scheduled_tokens=num_input_tokens,