From eaa2e51088d4daf36d47e566ad90e812f80e91b8 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Sat, 7 Jun 2025 20:56:12 -0400 Subject: [PATCH] [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299) Signed-off-by: Richard Zou --- tests/compile/piecewise/test_simple.py | 2 +- tests/compile/piecewise/test_toy_llama.py | 6 +-- tests/compile/test_config.py | 43 ++++++++++++++++++++++ vllm/compilation/counter.py | 2 +- vllm/compilation/cuda_piecewise_backend.py | 2 +- vllm/config.py | 5 ++- 6 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 tests/compile/test_config.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 852aa44d47aa5..a050646e55209 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -95,7 +95,7 @@ def _test_simple_piecewise_compile(*, use_inductor): num_piecewise_graphs_seen=5, # 2 * num_layers + 1 num_piecewise_capturable_graphs_seen=3, # 1 + num_layers num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= + num_cudagraph_captured= 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 2464d7889861f..410c0101c99b9 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -327,7 +327,7 @@ def _test_toy_llama(*, use_inductor): num_piecewise_graphs_seen=0, num_piecewise_capturable_graphs_seen=0, num_backend_compilations=0, - num_cudagraph_caputured=0, + num_cudagraph_captured=0, ): outputs.append( run_model(llama_config, use_inductor=False, use_compile=False)) @@ -343,7 +343,7 @@ def _test_toy_llama(*, use_inductor): num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, num_backend_compilations=1, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= + num_cudagraph_captured= 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen **kwargs, ): @@ -361,7 +361,7 @@ def _test_toy_llama(*, use_inductor): llama_config.num_layers, # 1 + num_layers num_backend_compilations=1 + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured=2 * + num_cudagraph_captured=2 * (1 + llama_config.num_layers ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py new file mode 100644 index 0000000000000..4f0d6fc5aab99 --- /dev/null +++ b/tests/compile/test_config.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +import vllm +from vllm.compilation.counter import compilation_counter +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) + +from .piecewise.test_simple import SillyModel + + +@pytest.fixture(scope="function", autouse=True) +def use_v1(monkeypatch): + """ + TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now, + I'll switch them over later. 
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '1')
+
+
+@pytest.mark.parametrize("enabled", [True, False])
+def test_use_cudagraphs(enabled):
+    assert vllm.envs.VLLM_USE_V1
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=enabled,
+        cudagraph_capture_sizes=[100],
+    ))
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100, device="cuda")
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_cudagraph_captured=1 if enabled else 0,
+    ):
+        # first run is warmup
+        model(inputs)
+        # second run does CUDAGraphs recording (if enabled)
+        model(inputs)
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index c584c103f4410..165347cfccef7 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -15,7 +15,7 @@ class CompilationCounter:
     # not including the splitting ops
     num_piecewise_capturable_graphs_seen: int = 0
     num_backend_compilations: int = 0
-    num_cudagraph_caputured: int = 0
+    num_cudagraph_captured: int = 0
     # InductorAdapter.compile calls
     num_inductor_compiles: int = 0
     # EagerAdapter.compile calls
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 8bf957368f6ab..993def49af700 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -193,7 +193,7 @@ class CUDAPiecewiseBackend:
             entry.output = weak_ref_tensors(output)
             entry.cudagraph = cudagraph
 
-            compilation_counter.num_cudagraph_caputured += 1
+            compilation_counter.num_cudagraph_captured += 1
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm/config.py b/vllm/config.py
index 31a1d208eaa75..d643daa5c58ec 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3918,12 +3918,14 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    use_cudagraph: bool = False
+    use_cudagraph: bool = envs.VLLM_USE_V1
     """Whether to use cudagraph inside compilation.
     - False: cudagraph inside compilation is not used.
     - True: cudagraph inside compilation is used. It requires
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
+    In the vLLM V1 Engine, this flag only applies to
+    CompilationLevel.PIECEWISE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     TODO: move outside cudagraph logic into compilation.
@@ -4425,7 +4427,6 @@ class VllmConfig:
             # FIXME(rob): Add function to set all of these.
             if not self.compilation_config.custom_ops:
                 self.compilation_config.custom_ops = ["none"]
-            self.compilation_config.use_cudagraph = True
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
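
For readers picking up this change, here is a minimal sketch (not part of the patch, and assuming a vLLM build that already contains it) of what the new default means in practice: `use_cudagraph` now tracks `envs.VLLM_USE_V1` at import time instead of defaulting to False, and an explicit `use_cudagraph=False` is no longer overwritten, because the forced assignment in `VllmConfig.__post_init__` has been removed.

# Sketch only, not part of the diff above; assumes a vLLM build that
# includes this patch. Illustrates the new use_cudagraph default.
import os

# envs.VLLM_USE_V1 is evaluated when vllm.config is imported, so set the
# environment variable before the import.
os.environ["VLLM_USE_V1"] = "1"

from vllm.config import CompilationConfig, CompilationLevel

# New default: use_cudagraph follows envs.VLLM_USE_V1 instead of False.
cfg = CompilationConfig(level=CompilationLevel.PIECEWISE)
assert cfg.use_cudagraph

# An explicit opt-out is respected, since VllmConfig.__post_init__ no
# longer force-sets use_cudagraph = True on the V1 path.
cfg_off = CompilationConfig(level=CompilationLevel.PIECEWISE,
                            use_cudagraph=False)
assert not cfg_off.use_cudagraph

The new tests/compile/test_config.py added above exercises the same behavior end to end by counting captured CUDA graphs through compilation_counter.expect.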