From eaa2e51088d4daf36d47e566ad90e812f80e91b8 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Sat, 7 Jun 2025 20:56:12 -0400 Subject: [PATCH] [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299) Signed-off-by: Richard Zou --- tests/compile/piecewise/test_simple.py | 2 +- tests/compile/piecewise/test_toy_llama.py | 6 +-- tests/compile/test_config.py | 43 ++++++++++++++++++++++ vllm/compilation/counter.py | 2 +- vllm/compilation/cuda_piecewise_backend.py | 2 +- vllm/config.py | 5 ++- 6 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 tests/compile/test_config.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 852aa44d47aa5..a050646e55209 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -95,7 +95,7 @@ def _test_simple_piecewise_compile(*, use_inductor): num_piecewise_graphs_seen=5, # 2 * num_layers + 1 num_piecewise_capturable_graphs_seen=3, # 1 + num_layers num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= + num_cudagraph_captured= 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 2464d7889861f..410c0101c99b9 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -327,7 +327,7 @@ def _test_toy_llama(*, use_inductor): num_piecewise_graphs_seen=0, num_piecewise_capturable_graphs_seen=0, num_backend_compilations=0, - num_cudagraph_caputured=0, + num_cudagraph_captured=0, ): outputs.append( run_model(llama_config, use_inductor=False, use_compile=False)) @@ -343,7 +343,7 @@ def _test_toy_llama(*, use_inductor): num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, num_backend_compilations=1, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured= + num_cudagraph_captured= 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen **kwargs, ): @@ -361,7 +361,7 @@ def _test_toy_llama(*, use_inductor): llama_config.num_layers, # 1 + num_layers num_backend_compilations=1 + llama_config.num_layers, # num_piecewise_capturable_graphs_seen - num_cudagraph_caputured=2 * + num_cudagraph_captured=2 * (1 + llama_config.num_layers ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py new file mode 100644 index 0000000000000..4f0d6fc5aab99 --- /dev/null +++ b/tests/compile/test_config.py @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + +import vllm +from vllm.compilation.counter import compilation_counter +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) + +from .piecewise.test_simple import SillyModel + + +@pytest.fixture(scope="function", autouse=True) +def use_v1(monkeypatch): + """ + TODO(rzou): The rest of tests/compile runs VLLM_USE_V1=0 right now, + I'll switch them over later. 
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '1')
+
+
+@pytest.mark.parametrize("enabled", [True, False])
+def test_use_cudagraphs(enabled):
+    assert vllm.envs.VLLM_USE_V1
+    vllm_config = VllmConfig(compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_cudagraph=enabled,
+        cudagraph_capture_sizes=[100],
+    ))
+    with set_current_vllm_config(vllm_config):
+        model = SillyModel(vllm_config=vllm_config, prefix='')
+
+    inputs = torch.randn(100, device="cuda")
+
+    with compilation_counter.expect(
+            num_graphs_seen=1,  # one graph for the model
+            num_cudagraph_captured=1 if enabled else 0,
+    ):
+        # first run is warmup
+        model(inputs)
+        # second run does CUDAGraphs recording (if enabled)
+        model(inputs)
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index c584c103f4410..165347cfccef7 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -15,7 +15,7 @@ class CompilationCounter:
     # not including the splitting ops
     num_piecewise_capturable_graphs_seen: int = 0
     num_backend_compilations: int = 0
-    num_cudagraph_caputured: int = 0
+    num_cudagraph_captured: int = 0
     # InductorAdapter.compile calls
     num_inductor_compiles: int = 0
     # EagerAdapter.compile calls
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 8bf957368f6ab..993def49af700 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -193,7 +193,7 @@ class CUDAPiecewiseBackend:
             entry.output = weak_ref_tensors(output)
             entry.cudagraph = cudagraph
 
-            compilation_counter.num_cudagraph_caputured += 1
+            compilation_counter.num_cudagraph_captured += 1
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm/config.py b/vllm/config.py
index 31a1d208eaa75..d643daa5c58ec 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3918,12 +3918,14 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    use_cudagraph: bool = False
+    use_cudagraph: bool = envs.VLLM_USE_V1
     """Whether to use cudagraph inside compilation.
     - False: cudagraph inside compilation is not used.
     - True: cudagraph inside compilation is used. It requires
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
+    In the vLLM V1 Engine, this flag only applies to
+    CompilationLevel.PIECEWISE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     TODO: move outside cudagraph logic into compilation.
@@ -4425,7 +4427,6 @@ class VllmConfig:
             # FIXME(rob): Add function to set all of these.
             if not self.compilation_config.custom_ops:
                 self.compilation_config.custom_ops = ["none"]
-            self.compilation_config.use_cudagraph = True
             self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_noop = False
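
For readers picking up this change, here is a minimal sketch (not part of the patch, and assuming a vLLM build that already contains it) of what the new default means in practice: `use_cudagraph` now tracks `envs.VLLM_USE_V1` at import time instead of defaulting to False, and an explicit `use_cudagraph=False` is no longer overwritten, because the forced assignment in `VllmConfig.__post_init__` has been removed.

# Sketch only, not part of the diff above; assumes a vLLM build that
# includes this patch. Illustrates the new use_cudagraph default.
import os

# envs.VLLM_USE_V1 is evaluated when vllm.config is imported, so set the
# environment variable before the import.
os.environ["VLLM_USE_V1"] = "1"

from vllm.config import CompilationConfig, CompilationLevel

# New default: use_cudagraph follows envs.VLLM_USE_V1 instead of False.
cfg = CompilationConfig(level=CompilationLevel.PIECEWISE)
assert cfg.use_cudagraph

# An explicit opt-out is respected, since VllmConfig.__post_init__ no
# longer force-sets use_cudagraph = True on the V1 path.
cfg_off = CompilationConfig(level=CompilationLevel.PIECEWISE,
                            use_cudagraph=False)
assert not cfg_off.use_cudagraph

The new tests/compile/test_config.py added above exercises the same behavior end to end by counting captured CUDA graphs through compilation_counter.expect.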