[Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)

Signed-off-by: Richard Zou <zou3519@gmail.com>

This commit is contained in:
parent d77f7fb871
commit eaa2e51088

In summary, the change renames the misspelled counter field `num_cudagraph_caputured` to `num_cudagraph_captured`, makes `CompilationConfig.use_cudagraph` default to `envs.VLLM_USE_V1`, stops V1 from force-overriding a user-supplied value, and adds tests/compile/test_config.py to cover both settings.
@@ -95,7 +95,7 @@ def _test_simple_piecewise_compile(*, use_inductor):
             num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
             num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
             num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
             6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
@@ -327,7 +327,7 @@ def _test_toy_llama(*, use_inductor):
             num_piecewise_graphs_seen=0,
             num_piecewise_capturable_graphs_seen=0,
             num_backend_compilations=0,
-            num_cudagraph_caputured=0,
+            num_cudagraph_captured=0,
     ):
         outputs.append(
             run_model(llama_config, use_inductor=False, use_compile=False))
@@ -343,7 +343,7 @@ def _test_toy_llama(*, use_inductor):
             num_piecewise_graphs_seen=1,
             num_piecewise_capturable_graphs_seen=1,
             num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
             2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
             **kwargs,
     ):
@@ -361,7 +361,7 @@ def _test_toy_llama(*, use_inductor):
             llama_config.num_layers,  # 1 + num_layers
             num_backend_compilations=1 +
             llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=2 *
+            num_cudagraph_captured=2 *
             (1 + llama_config.num_layers
             ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
     ):
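As a quick check of the expected values in the hunks above (num_layers = 2 and num_cudagraph_sizes = 2 are inferred from the inline comments; the test setup itself is not shown in this diff):

# Sanity check of the counter arithmetic used in the test expectations.
num_layers = 2            # inferred from 5 == 2 * num_layers + 1
num_cudagraph_sizes = 2   # inferred from 6 == num_cudagraph_sizes * 3

num_piecewise_graphs_seen = 2 * num_layers + 1            # 5
num_piecewise_capturable_graphs_seen = 1 + num_layers     # 3
num_backend_compilations = num_piecewise_capturable_graphs_seen
num_cudagraph_captured = (num_cudagraph_sizes *
                          num_piecewise_capturable_graphs_seen)

assert num_piecewise_graphs_seen == 5
assert num_cudagraph_captured == 6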
tests/compile/test_config.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+import vllm
+from vllm.compilation.counter import compilation_counter
+from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+                         set_current_vllm_config)
+
+from .piecewise.test_simple import SillyModel
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v1(monkeypatch):
+    """
+    TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now,
+    I'll switch them over later.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '1')
+
+
+@pytest.mark.parametrize("enabled", [True, False])
+def test_use_cudagraphs(enabled):
+    assert vllm.envs.VLLM_USE_V1
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=enabled,
+        cudagraph_capture_sizes=[100],
+    ))
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100, device="cuda")
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_cudagraph_captured=1 if enabled else 0,
+    ):
+        # first run is warmup
+        model(inputs)
+        # second run does CUDAGraphs recording (if enabled)
+        model(inputs)
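Note that the model is constructed inside set_current_vllm_config(vllm_config): the compilation machinery apparently reads the ambient config while the model is being built, so constructing it outside the context would not pick up use_cudagraph. A minimal sketch of that ambient-config pattern (illustrative names, not vLLM's implementation):

from contextlib import contextmanager

_current_config = None

@contextmanager
def set_current_config(cfg):
    # Illustrative only: stash the config in a module-level slot so code
    # that runs during model construction can read it, then restore.
    global _current_config
    prev, _current_config = _current_config, cfg
    try:
        yield
    finally:
        _current_config = prev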
@@ -15,7 +15,7 @@ class CompilationCounter:
     # not including the splitting ops
     num_piecewise_capturable_graphs_seen: int = 0
     num_backend_compilations: int = 0
-    num_cudagraph_caputured: int = 0
+    num_cudagraph_captured: int = 0
    # InductorAdapter.compile calls
     num_inductor_compiles: int = 0
     # EagerAdapter.compile calls
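For context, compilation_counter.expect(...) in the tests asserts on deltas of these fields across the with block. A minimal sketch of that delta-check pattern, assuming the counter is a plain dataclass; this illustrates the idea, not vLLM's exact implementation:

import copy
from contextlib import contextmanager
from dataclasses import dataclass

@dataclass
class Counter:
    num_cudagraph_captured: int = 0

    @contextmanager
    def expect(self, **deltas):
        # Snapshot the fields, run the block, then assert each named
        # field moved by exactly the expected amount.
        before = copy.deepcopy(self)
        yield
        for name, expected in deltas.items():
            actual = getattr(self, name) - getattr(before, name)
            assert actual == expected, f"{name}: got {actual}, want {expected}"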
@@ -193,7 +193,7 @@ class CUDAPiecewiseBackend:
             entry.output = weak_ref_tensors(output)
             entry.cudagraph = cudagraph

-            compilation_counter.num_cudagraph_caputured += 1
+            compilation_counter.num_cudagraph_captured += 1

             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
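The renamed counter bump sits at the end of the capture path. A hedged sketch of the capture step it belongs to, using PyTorch's public CUDA graph API; the function and variable names here are illustrative, not vLLM's exact code:

import torch

def capture_cudagraph(runnable, *args):
    # Record one execution of `runnable` into a CUDA graph and return
    # (graph, output). Replays reuse the same memory, which is why all
    # input buffers must have fixed addresses (see the use_cudagraph
    # docstring in the config hunk below).
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        output = runnable(*args)
    return graph, output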
@@ -3918,12 +3918,14 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    use_cudagraph: bool = False
+    use_cudagraph: bool = envs.VLLM_USE_V1
     """Whether to use cudagraph inside compilation.
     - False: cudagraph inside compilation is not used.
     - True: cudagraph inside compilation is used. It requires
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
+    In the vLLM V1 Engine, this flag only applies for
+    CompilationLevel.PIECEWISE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     TODO: move outside cudagraph logic into compilation.
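With this hunk the default tracks the engine version instead of being hard-coded to False. A hedged illustration (assumes CompilationConfig is constructible standalone, as the new test above does):

import vllm.envs as envs
from vllm.config import CompilationConfig

cfg = CompilationConfig()
# Under V1 the default is now True; under V0 it stays False.
assert cfg.use_cudagraph == envs.VLLM_USE_V1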
@@ -4425,7 +4427,6 @@ class VllmConfig:
             # FIXME(rob): Add function to set all of these.
             if not self.compilation_config.custom_ops:
                 self.compilation_config.custom_ops = ["none"]
-            self.compilation_config.use_cudagraph = True
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
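The deleted line is the core of the bugfix: V1 previously forced use_cudagraph = True during post-init, silently clobbering a user-supplied False. A hedged illustration of the behavior after this change:

from vllm.config import CompilationConfig, VllmConfig

# An explicit opt-out now survives VllmConfig post-init under V1.
config = VllmConfig(compilation_config=CompilationConfig(use_cudagraph=False))
assert config.compilation_config.use_cudagraph is False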