Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
commit bc997c18ca (parent d55c6010ac)
@@ -20,7 +20,6 @@ class TestSetting:
     tp_size: int
     attn_backend: str
     method: str
-    fullgraph: bool


 # we cannot afford testing the full Cartesian product
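For orientation, a sketch of the dataclass as it stands after this change. The model and pp_size fields are assumptions inferred from the rest of the diff (a model= kwarg and a pp_size * tp_size guard both appear in later hunks):

from dataclasses import dataclass

@dataclass
class TestSetting:
    model: str    # assumed: set via model=... in the parametrize list below
    pp_size: int  # assumed: used in the pp_size * tp_size device guard below
    tp_size: int
    attn_backend: str
    method: str
    # fullgraph: bool  -- removed; fullgraph capture is now unconditional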
@@ -36,7 +35,6 @@ class TestSetting:
             tp_size=2,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # llama model with quantization
         TestSetting(
@@ -46,7 +44,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # MoE model
         TestSetting(
@@ -56,7 +53,6 @@ class TestSetting:
             tp_size=2,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # embedding model
         TestSetting(
@@ -73,7 +69,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="encode",
-            fullgraph=True,
         ),
         TestSetting(
             model="BAAI/bge-base-en-v1.5",
@@ -82,7 +77,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="encode",
-            fullgraph=True,
         ),
         # vision language model
         TestSetting(
@@ -92,7 +86,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="generate_with_image",
-            fullgraph=False,
         ),
     ],
 )
@@ -109,9 +102,8 @@ def test_compile_correctness(
     tp_size = test_setting.tp_size
     attn_backend = test_setting.attn_backend
     method = test_setting.method
-    fullgraph = test_setting.fullgraph
-    if cuda_device_count_stateless() != pp_size * tp_size:
-        pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
+    if cuda_device_count_stateless() < pp_size * tp_size:
+        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
                     f"{cuda_device_count_stateless()}")

     with monkeypatch.context() as m:
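Besides dropping the fullgraph field, the hunk above relaxes the device guard from an exact match (!=, "Need exactly") to a lower bound (<, "Need at least"). A standalone sketch of the new guard; require_gpus is a hypothetical helper, not vllm API:

import pytest

def require_gpus(available: int, pp_size: int, tp_size: int) -> None:
    # any host with at least pp_size * tp_size devices now runs the test;
    # the old guard skipped unless the count matched exactly
    if available < pp_size * tp_size:
        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
                    f"{available}")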
@@ -149,9 +141,5 @@ def test_compile_correctness(
         ]:
             all_args.append(final_args + [f"-O{level}"])
             all_envs.append({})
-            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-                # "DYNAMO_ONCE" will always use fullgraph
-                all_envs[-1][
-                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

         compare_all_settings(model, all_args * 3, all_envs, method=method)
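For context, a sketch of the -O matrix this loop builds now that the per-level env override is gone. CompilationLevel here is a stand-in with assumed values mirroring vllm.config.CompilationLevel, and final_args is hypothetical:

class CompilationLevel:  # stand-in; values assumed
    NO_COMPILATION = 0
    DYNAMO_AS_IS = 1
    DYNAMO_ONCE = 2
    PIECEWISE = 3

final_args = ["--enforce-eager"]  # hypothetical base CLI args
all_args, all_envs = [], []
for level in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_ONCE]:
    all_args.append(final_args + [f"-O{level}"])
    all_envs.append({})  # no VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE override anymore
print(all_args)  # [['--enforce-eager', '-O0'], ['--enforce-eager', '-O2']]
print(all_envs)  # [{}, {}]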
@@ -79,9 +79,7 @@ def test_full_graph(
 ):
     model, model_kwargs = model_info

-    with monkeypatch.context() as m:
-        # make sure these models can be captured in full graph mode
-        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+    with monkeypatch.context():
         print(f"MODEL={model}")

         run_model(optimization_level, model, model_kwargs)
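With the setenv call removed, the context manager no longer sets the fullgraph flag here. For reference, a self-contained sketch of the pattern the old code relied on (pytest's real monkeypatch API):

import os

def test_env_scoping(monkeypatch):
    # monkeypatch.context() scopes overrides to the with-block and
    # restores the environment on exit
    with monkeypatch.context() as m:
        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
        assert os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] == "1"
    # assuming the variable was not already set before the test
    assert "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE" not in os.environ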
@@ -10,7 +10,6 @@ from typing import Callable, Optional

 import torch

-import vllm.envs as envs
 from vllm.config import (CompilationLevel, CUDAGraphMode,
                          get_current_vllm_config)
 from vllm.logger import init_logger
@@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher:
             options = get_current_vllm_config(
             ).compilation_config.inductor_compile_config

-        compiled_callable = torch.compile(
-            self.forward,
-            fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-            backend=backend,
-            options=options)
+        compiled_callable = torch.compile(self.forward,
+                                          fullgraph=True,
+                                          backend=backend,
+                                          options=options)

         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
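fullgraph=True is now hard-coded where the env var used to decide. A minimal self-contained illustration of what the flag does (standard torch.compile API; the toy function is hypothetical):

import torch

def f(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) + 1.0

# with fullgraph=True, Dynamo raises on any graph break instead of
# silently splitting the function into multiple compiled regions
compiled = torch.compile(f, fullgraph=True, backend="inductor")
print(compiled(torch.randn(4)))

In the wrapper above, the options argument carries compilation_config.inductor_compile_config through to the Inductor backend.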
@@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_FLASH_ATTN_VERSION":
     lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),

-    # Internal flag to enable Dynamo fullgraph capture
-    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
-    lambda: bool(
-        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
-
     # Feature flag to enable/disable Inductor standalone compile.
     # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
     # enabled by default.
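The deleted entry followed the usual envs.py pattern: a name mapped to a zero-argument lambda that reads os.environ lazily on access. A self-contained sketch reproducing the removed entry:

import os

environment_variables = {
    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
    lambda: bool(
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
}
# True with the variable unset, since the default "1" is not "0"
print(environment_variables["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"]())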
@@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             backend = self.vllm_config.compilation_config.init_backend(
                 self.vllm_config)
             compilation_counter.dynamo_as_is_count += 1
-            self.model.compile(
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+            self.model.compile(fullgraph=True, backend=backend)
             return
         # for other compilation levels, cudagraph behavior is controlled by
         # CudagraphWraper and CudagraphDispatcher of vllm.
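Unlike the self.model = torch.compile(self.model, ...) reassignment in the next hunk, self.model.compile(...) above compiles the module in place. A small sketch of the distinction (standard PyTorch API, torch >= 2.1 assumed):

import torch
import torch.nn as nn

model = nn.Linear(8, 8)
# nn.Module.compile() compiles the module in place, so callers keep their
# existing reference; torch.compile(model) instead returns a new
# OptimizedModule wrapper that must be reassigned
model.compile(fullgraph=True, backend="inductor")
print(model(torch.randn(2, 8)).shape)  # torch.Size([2, 8])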
@@ -18,7 +18,6 @@ import torch.distributed
 import torch.nn as nn
 from tqdm.auto import tqdm

-import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
@@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             backend = self.vllm_config.compilation_config.init_backend(
                 self.vllm_config)
             compilation_counter.dynamo_as_is_count += 1
-            self.model = torch.compile(
-                self.model,
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+            self.model = torch.compile(self.model,
+                                       fullgraph=True,
+                                       backend=backend)

     def get_model(self) -> nn.Module:
         return self.model