Disable remote caching when calling compile_fx (#16611)

Signed-off-by: rzou <zou3519@gmail.com>
2025-12-25 10:36:32 +08:00 · 2025-04-16 01:18:28 -04:00 · 2025-04-16 01:18:28 -04:00 · 966c742ed2
commit 966c742ed2
parent 0d7d05f4b6
1 changed files with 13 additions and 0 deletions
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -290,6 +290,19 @@ class InductorAdaptor(CompilerInterface):
            # Dynamo metrics context, see method for more details.
            stack.enter_context(self.metrics_context())

+            # Disable remote caching. When it is enabled, a remote cache hit
+            # means the monkey-patched functions never actually get called,
+            # but vLLM today assumes and requires that the monkey-patched
+            # functions get hit.
+            # TODO(zou3519): we're going to replace all of this with
+            # standalone_compile sometime.
+            if is_torch_equal_or_newer("2.6"):
+                stack.enter_context(
+                    torch._inductor.config.patch(fx_graph_remote_cache=False))
+                stack.enter_context(
+                    torch._functorch.config.patch(
+                        enable_remote_autograd_cache=False))
+
            compiled_graph = compile_fx(
                graph,
                example_inputs,