mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 06:55:01 +08:00
[BugFix] VLLM_DISABLE_COMPILE_CACHE=1 should disable all reads and writes from the cache (#20942)
Signed-off-by: Richard Zou <zou3519@gmail.com>
This commit is contained in:
parent
8cdc371217
commit
ba8c300018
@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
|
|||||||
assert not vllm_config.compilation_config.use_cudagraph
|
assert not vllm_config.compilation_config.use_cudagraph
|
||||||
|
|
||||||
|
|
||||||
|
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
    """With VLLM_DISABLE_COMPILE_CACHE=1, no compile-cache writes happen.

    Loads a small model with compilation enabled and asserts, via the
    process-wide compilation counters, that neither a vLLM cache entry
    nor a standalone-compile artifact was written to disk.
    """
    assert vllm.envs.VLLM_USE_V1

    # "spawn" keeps the worker in this process, so the counters we
    # inspect below are the ones the compilation actually increments.
    monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
    monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)

    compilation_config = {
        "use_cudagraph": False,  # speed things up a bit
    }
    with (
            # Expect zero cache writes of either kind while the model loads.
            compilation_counter.expect(num_cache_entries_updated=0,
                                       num_compiled_artifacts_saved=0),
            # loading the model causes compilation (if enabled) to happen
            vllm_runner('facebook/opt-125m',
                        compilation_config=compilation_config,
                        gpu_memory_utilization=0.4) as _):
        pass
@pytest.mark.parametrize("enabled", [True, False])
|
@pytest.mark.parametrize("enabled", [True, False])
|
||||||
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||||
assert vllm.envs.VLLM_USE_V1
|
assert vllm.envs.VLLM_USE_V1
|
||||||
|
|||||||
@ -183,9 +183,10 @@ class CompilerManager:
|
|||||||
assert compiled_graph is not None, "Failed to compile the graph"
|
assert compiled_graph is not None, "Failed to compile the graph"
|
||||||
|
|
||||||
# store the artifact in the cache
|
# store the artifact in the cache
|
||||||
if handle is not None:
|
if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
|
||||||
self.cache[(runtime_shape, graph_index,
|
self.cache[(runtime_shape, graph_index,
|
||||||
self.compiler.name)] = handle
|
self.compiler.name)] = handle
|
||||||
|
compilation_counter.num_cache_entries_updated += 1
|
||||||
self.is_cache_updated = True
|
self.is_cache_updated = True
|
||||||
if graph_index == 0:
|
if graph_index == 0:
|
||||||
# adds some info logging for the first graph
|
# adds some info logging for the first graph
|
||||||
|
|||||||
@ -213,7 +213,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
|
|||||||
# Save the compiled artifact to disk in the specified path
|
# Save the compiled artifact to disk in the specified path
|
||||||
assert key is not None
|
assert key is not None
|
||||||
path = os.path.join(self.cache_dir, key)
|
path = os.path.join(self.cache_dir, key)
|
||||||
|
if not envs.VLLM_DISABLE_COMPILE_CACHE:
|
||||||
compiled_graph.save(path=path, format="unpacked")
|
compiled_graph.save(path=path, format="unpacked")
|
||||||
|
compilation_counter.num_compiled_artifacts_saved += 1
|
||||||
return compiled_graph, (key, path)
|
return compiled_graph, (key, path)
|
||||||
|
|
||||||
def load(self,
|
def load(self,
|
||||||
|
|||||||
@ -23,6 +23,10 @@ class CompilationCounter:
|
|||||||
num_inductor_compiles: int = 0
|
num_inductor_compiles: int = 0
|
||||||
# EagerAdapter.compile calls
|
# EagerAdapter.compile calls
|
||||||
num_eager_compiles: int = 0
|
num_eager_compiles: int = 0
|
||||||
|
# The number of time vLLM's compiler cache entry was updated
|
||||||
|
num_cache_entries_updated: int = 0
|
||||||
|
# The number of standalone_compile compiled artifacts saved
|
||||||
|
num_compiled_artifacts_saved: int = 0
|
||||||
|
|
||||||
def clone(self) -> "CompilationCounter":
|
def clone(self) -> "CompilationCounter":
|
||||||
return copy.deepcopy(self)
|
return copy.deepcopy(self)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user