mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 06:55:01 +08:00
[BugFix] VLLM_DISABLE_COMPILE_CACHE=1 should disable all reads and writes from the cache (#20942)
Signed-off-by: Richard Zou <zou3519@gmail.com>
This commit is contained in:
parent
8cdc371217
commit
ba8c300018
@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
|
|||||||
assert not vllm_config.compilation_config.use_cudagraph
|
assert not vllm_config.compilation_config.use_cudagraph
|
||||||
|
|
||||||
|
|
||||||
|
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
    """With VLLM_DISABLE_COMPILE_CACHE=1, no compile-cache writes happen.

    Loads a small model with compilation enabled and asserts, via the
    process-wide compilation counters, that neither a vLLM cache entry
    nor a standalone-compile artifact was written to disk.
    """
    assert vllm.envs.VLLM_USE_V1

    # "spawn" keeps the worker in this process, so the counters we
    # inspect below are the ones the compilation actually increments.
    monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
    monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)

    compilation_config = {
        "use_cudagraph": False,  # speed things up a bit
    }
    with (
            # Expect zero cache writes of either kind while the model loads.
            compilation_counter.expect(num_cache_entries_updated=0,
                                       num_compiled_artifacts_saved=0),
            # loading the model causes compilation (if enabled) to happen
            vllm_runner('facebook/opt-125m',
                        compilation_config=compilation_config,
                        gpu_memory_utilization=0.4) as _):
        pass
@pytest.mark.parametrize("enabled", [True, False])
|
@pytest.mark.parametrize("enabled", [True, False])
|
||||||
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||||
assert vllm.envs.VLLM_USE_V1
|
assert vllm.envs.VLLM_USE_V1
|
||||||
|
|||||||
@ -183,9 +183,10 @@ class CompilerManager:
|
|||||||
assert compiled_graph is not None, "Failed to compile the graph"
|
assert compiled_graph is not None, "Failed to compile the graph"
|
||||||
|
|
||||||
# store the artifact in the cache
|
# store the artifact in the cache
|
||||||
if handle is not None:
|
if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
|
||||||
self.cache[(runtime_shape, graph_index,
|
self.cache[(runtime_shape, graph_index,
|
||||||
self.compiler.name)] = handle
|
self.compiler.name)] = handle
|
||||||
|
compilation_counter.num_cache_entries_updated += 1
|
||||||
self.is_cache_updated = True
|
self.is_cache_updated = True
|
||||||
if graph_index == 0:
|
if graph_index == 0:
|
||||||
# adds some info logging for the first graph
|
# adds some info logging for the first graph
|
||||||
|
|||||||
@ -213,7 +213,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
|
|||||||
# Save the compiled artifact to disk in the specified path
|
# Save the compiled artifact to disk in the specified path
|
||||||
assert key is not None
|
assert key is not None
|
||||||
path = os.path.join(self.cache_dir, key)
|
path = os.path.join(self.cache_dir, key)
|
||||||
|
if not envs.VLLM_DISABLE_COMPILE_CACHE:
|
||||||
compiled_graph.save(path=path, format="unpacked")
|
compiled_graph.save(path=path, format="unpacked")
|
||||||
|
compilation_counter.num_compiled_artifacts_saved += 1
|
||||||
return compiled_graph, (key, path)
|
return compiled_graph, (key, path)
|
||||||
|
|
||||||
def load(self,
|
def load(self,
|
||||||
|
|||||||
@ -23,6 +23,10 @@ class CompilationCounter:
|
|||||||
num_inductor_compiles: int = 0
|
num_inductor_compiles: int = 0
|
||||||
# EagerAdapter.compile calls
|
# EagerAdapter.compile calls
|
||||||
num_eager_compiles: int = 0
|
num_eager_compiles: int = 0
|
||||||
|
# The number of time vLLM's compiler cache entry was updated
|
||||||
|
num_cache_entries_updated: int = 0
|
||||||
|
# The number of standalone_compile compiled artifacts saved
|
||||||
|
num_compiled_artifacts_saved: int = 0
|
||||||
|
|
||||||
def clone(self) -> "CompilationCounter":
|
def clone(self) -> "CompilationCounter":
|
||||||
return copy.deepcopy(self)
|
return copy.deepcopy(self)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user