diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index fd2b1866e62e..a1e5127ebeeb 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -20,7 +20,6 @@ class TestSetting:
     tp_size: int
     attn_backend: str
     method: str
-    fullgraph: bool
 
 
 # we cannot afford testing the full Cartesian product
@@ -36,7 +35,6 @@ class TestSetting:
             tp_size=2,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # llama model with quantization
         TestSetting(
@@ -46,7 +44,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # MoE model
         TestSetting(
@@ -56,7 +53,6 @@ class TestSetting:
            tp_size=2,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # embedding model
         TestSetting(
@@ -73,7 +69,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="encode",
-            fullgraph=True,
         ),
         TestSetting(
             model="BAAI/bge-base-en-v1.5",
@@ -82,7 +77,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="encode",
-            fullgraph=True,
         ),
         # vision language model
         TestSetting(
@@ -92,7 +86,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="generate_with_image",
-            fullgraph=False,
         ),
     ],
 )
@@ -109,9 +102,8 @@ def test_compile_correctness(
     tp_size = test_setting.tp_size
     attn_backend = test_setting.attn_backend
     method = test_setting.method
-    fullgraph = test_setting.fullgraph
-    if cuda_device_count_stateless() != pp_size * tp_size:
-        pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
+    if cuda_device_count_stateless() < pp_size * tp_size:
+        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
                     f"{cuda_device_count_stateless()}")
 
     with monkeypatch.context() as m:
@@ -149,9 +141,5 @@ def test_compile_correctness(
     ]:
         all_args.append(final_args + [f"-O{level}"])
         all_envs.append({})
-        if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-            # "DYNAMO_ONCE" will always use fullgraph
-            all_envs[-1][
-                "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore
 
     compare_all_settings(model, all_args * 3, all_envs, method=method)
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 84178344a5f3..3439a1b29038 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -79,9 +79,7 @@ def test_full_graph(
 ):
     model, model_kwargs = model_info
 
-    with monkeypatch.context() as m:
-        # make sure these models can be captured in full graph mode
-        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+    with monkeypatch.context():
         print(f"MODEL={model}")
         run_model(optimization_level, model, model_kwargs)
 
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 96d4eae2ee9a..930e4d27b410 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -10,7 +10,6 @@ from typing import Callable, Optional
 
 import torch
 
-import vllm.envs as envs
 from vllm.config import (CompilationLevel, CUDAGraphMode,
                          get_current_vllm_config)
 from vllm.logger import init_logger
@@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher:
                 options = get_current_vllm_config(
                 ).compilation_config.inductor_compile_config
 
-            compiled_callable = torch.compile(
-                self.forward,
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend,
-                options=options)
+            compiled_callable = torch.compile(self.forward,
+                                              fullgraph=True,
+                                              backend=backend,
+                                              options=options)
 
         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
diff --git a/vllm/envs.py b/vllm/envs.py
index 294a0b920fb7..3991a789d80f 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_FLASH_ATTN_VERSION":
     lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),
 
-    # Internal flag to enable Dynamo fullgraph capture
-    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
-    lambda: bool(
-        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
-
     # Feature flag to enable/disable Inductor standalone compile.
     # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
     # enabled by default.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 3539f7561205..dffadd1d769b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             backend = self.vllm_config.compilation_config.init_backend(
                 self.vllm_config)
             compilation_counter.dynamo_as_is_count += 1
-            self.model.compile(
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+            self.model.compile(fullgraph=True, backend=backend)
             return
         # for other compilation levels, cudagraph behavior is controlled by
         # CudagraphWraper and CudagraphDispatcher of vllm.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index c91c871766cf..f662f5a85eff 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -18,7 +18,6 @@ import torch.distributed
 import torch.nn as nn
 from tqdm.auto import tqdm
 
-import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
@@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             backend = self.vllm_config.compilation_config.init_backend(
                 self.vllm_config)
             compilation_counter.dynamo_as_is_count += 1
-            self.model = torch.compile(
-                self.model,
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+            self.model = torch.compile(self.model,
+                                       fullgraph=True,
+                                       backend=backend)
 
     def get_model(self) -> nn.Module:
         return self.model
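Illustrative sketch (not part of the diff): with the VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE escape hatch removed, every torch.compile call above passes fullgraph=True unconditionally, so a Dynamo graph break now fails fast instead of silently splitting the graph. The toy functions below are hypothetical and only demonstrate that torch.compile behavior; they are not taken from vLLM.

import torch
import torch._dynamo


def clean_forward(x: torch.Tensor) -> torch.Tensor:
    # traces as a single FX graph, so fullgraph=True is satisfied
    return torch.relu(x) * 2


def breaking_forward(x: torch.Tensor) -> torch.Tensor:
    y = x + 1
    torch._dynamo.graph_break()  # deliberate graph break for the demo
    return y * 2


compiled_ok = torch.compile(clean_forward, fullgraph=True)
compiled_ok(torch.randn(4))  # compiles and runs as one graph

compiled_strict = torch.compile(breaking_forward, fullgraph=True)
try:
    compiled_strict(torch.randn(4))
except Exception as err:
    # with fullgraph=True the graph break is a hard error rather than a
    # silent fall back to eager execution of the unsupported piece
    print(f"rejected under fullgraph=True: {type(err).__name__}")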