[Bugfix] tpu_model_runner: set vllm config context when calling reset_dynamo_cache() (#30331)

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
Daniele Trifirò, 2025-12-10 13:58:35 +01:00 (committed by GitHub)
parent 9db78f34dc
commit 53d2420b44


@@ -10,7 +10,7 @@
 import torch
 import torch.nn as nn
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed import (
     ensure_model_parallel_initialized,
     init_distributed_environment,
@@ -207,7 +207,8 @@ class TPUWorker:
         # one compiled bytecode. Having one FX graph/cached bytecode per
         # compiled model is required for `support_torch_compile` decorator to
         # skip dynamo guard.
-        self.model_runner.reset_dynamo_cache()
+        with set_current_vllm_config(self.vllm_config):
+            self.model_runner.reset_dynamo_cache()
         # Get the maximum amount of memory used by the model weights and
         # intermediate activations.
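
For reference, a minimal sketch of the pattern this patch applies. The helper name and the `worker` parameter are illustrative only and not part of vLLM; the assumption is a TPUWorker-like object that exposes `vllm_config` and a `model_runner` with a `reset_dynamo_cache()` method, as in the hunk above.

    from vllm.config import set_current_vllm_config


    def reset_dynamo_cache_with_config(worker) -> None:
        # Hypothetical helper mirroring the patched TPUWorker code path:
        # make the worker's VllmConfig the "current" config before the Dynamo
        # cache is rebuilt, so any code reached during the reset (such as
        # modules wrapped by the `support_torch_compile` decorator) can look
        # up the active configuration instead of finding none set.
        with set_current_vllm_config(worker.vllm_config):
            worker.model_runner.reset_dynamo_cache()

Because `set_current_vllm_config` is used as a context manager, the previous config context is expected to be restored on exit, so the change only affects the duration of the cache reset.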