diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0a99994e243ae..8fc3587f7813c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -468,7 +468,9 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -482,7 +484,9 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 diff --git a/tests/compile/fullgraph/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py index e2897b2271b8f..621f6a51a918f 100644 --- a/tests/compile/fullgraph/test_multimodal_compile.py +++ b/tests/compile/fullgraph/test_multimodal_compile.py @@ -17,7 +17,6 @@ def test_compile(): # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073 @pytest.mark.forked @pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda") -@pytest.mark.xfail def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch): """Test that Qwen2.5-VL vision submodules are compiled. diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py index d849a8617ebd2..14ae8233f1131 100644 --- a/tests/compile/test_compile_ranges.py +++ b/tests/compile/test_compile_ranges.py @@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache): vllm_config = VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, @@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges(): VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=compilation_config, ) @@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): ) scheduler_config = SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ) torch.set_default_device("cuda") diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b655031..6ed77b0085f51 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -5,9 +5,14 @@ import copy import pytest import torch -from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.compilation.inductor_pass import ( + CallableInductorPass, + InductorPass, + pass_context, +) from vllm.compilation.pass_manager import PostGradPassManager from vllm.config import ModelConfig, VllmConfig +from vllm.config.utils import Range # dummy custom pass that doesn't inherit @@ -42,35 +47,37 @@ class ProperPass(InductorPass): ], ) def test_pass_manager_uuid(callable): - # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + # Set the pass context as PassManager uuid uses it + with pass_context(Range(start=1, end=8)): + # Some passes need dtype to be set + config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) - pass_manager = PostGradPassManager() - pass_manager.configure(config) + pass_manager = PostGradPassManager() + pass_manager.configure(config) - # Check that UUID is different if the same pass is added 2x - pass_manager.add(callable) - uuid1 = pass_manager.uuid() - pass_manager.add(callable) - uuid2 = pass_manager.uuid() - assert uuid1 != uuid2 + # Check that UUID is different if the same pass is added 2x + pass_manager.add(callable) + uuid1 = pass_manager.uuid() + pass_manager.add(callable) + uuid2 = pass_manager.uuid() + assert uuid1 != uuid2 - # UUID should be the same as the original one, - # as we constructed in the same way. - pass_manager2 = PostGradPassManager() - pass_manager2.configure(config) - pass_manager2.add(callable) - assert uuid1 == pass_manager2.uuid() + # UUID should be the same as the original one, + # as we constructed in the same way. + pass_manager2 = PostGradPassManager() + pass_manager2.configure(config) + pass_manager2.add(callable) + assert uuid1 == pass_manager2.uuid() - # UUID should be different due to config change - config2 = copy.deepcopy(config) - config2.compilation_config.pass_config.fuse_norm_quant = ( - not config2.compilation_config.pass_config.fuse_norm_quant - ) - config2.compilation_config.pass_config.fuse_act_quant = ( - not config2.compilation_config.pass_config.fuse_act_quant - ) - pass_manager3 = PostGradPassManager() - pass_manager3.configure(config2) - pass_manager3.add(callable) - assert uuid1 != pass_manager3.uuid() + # UUID should be different due to config change + config2 = copy.deepcopy(config) + config2.compilation_config.pass_config.fuse_norm_quant = ( + not config2.compilation_config.pass_config.fuse_norm_quant + ) + config2.compilation_config.pass_config.fuse_act_quant = ( + not config2.compilation_config.pass_config.fuse_act_quant + ) + pass_manager3 = PostGradPassManager() + pass_manager3.configure(config2) + pass_manager3.add(callable) + assert uuid1 != pass_manager3.uuid() diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 8159b817f637a..dbf154eeb86a4 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + import functools import hashlib import inspect @@ -8,15 +10,17 @@ import json import types from collections.abc import Callable from contextlib import contextmanager -from typing import Any +from typing import TYPE_CHECKING, Any import torch from torch import fx from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily -from vllm.config.utils import Range from vllm.utils.torch_utils import is_torch_equal_or_newer +if TYPE_CHECKING: + from vllm.config.utils import Range + if is_torch_equal_or_newer("2.6"): from torch._inductor.custom_graph_pass import CustomGraphPass else: diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 129b9b5deea31..a15c693767a51 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -53,8 +53,27 @@ class PiecewiseBackend: self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1 self.is_full_graph = total_piecewise_compiles == 1 + # TODO: we need to generalize encoder compilation to other models + self.is_encoder_compilation = vllm_backend.prefix in [ + "Qwen2_5_VisionPatchEmbed", + "Qwen2_5_VisionPatchMerger", + "Qwen2_5_VisionBlock", + ] self.compile_ranges = self.compilation_config.get_compile_ranges() + if self.is_encoder_compilation: + # For encoder compilation we use the max int32 value + # to set the upper bound of the compile ranges + max_int32 = 2**31 - 1 + last_compile_range = self.compile_ranges[-1] + assert ( + last_compile_range.end + == vllm_config.scheduler_config.max_num_batched_tokens + ) + self.compile_ranges[-1] = Range( + start=last_compile_range.start, end=max_int32 + ) + log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" logger.debug_once(log_string)