mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-24 15:11:25 +08:00
[BugFix] Fix non detected failing tests (#30277)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
parent
804e3468c0
commit
0b6a8a304c
@ -468,7 +468,9 @@ steps:
|
|||||||
# tests covered elsewhere.
|
# tests covered elsewhere.
|
||||||
# Use `find` to launch multiple instances of pytest so that
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
|
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||||
|
# (using -0 for proper path handling)
|
||||||
|
- "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
@ -482,7 +484,9 @@ steps:
|
|||||||
# as it is a heavy test that is covered in other steps.
|
# as it is a heavy test that is covered in other steps.
|
||||||
# Use `find` to launch multiple instances of pytest so that
|
# Use `find` to launch multiple instances of pytest so that
|
||||||
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
|
||||||
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
|
# However, find does not normally propagate error codes, so we combine it with xargs
|
||||||
|
# (using -0 for proper path handling)
|
||||||
|
- "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"
|
||||||
|
|
||||||
- label: PyTorch Fullgraph Test # 27min
|
- label: PyTorch Fullgraph Test # 27min
|
||||||
timeout_in_minutes: 40
|
timeout_in_minutes: 40
|
||||||
|
|||||||
@ -17,7 +17,6 @@ def test_compile():
|
|||||||
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
# forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
|
||||||
@pytest.mark.forked
|
@pytest.mark.forked
|
||||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
|
def test_qwen2_5_vl_compilation(vllm_runner, monkeypatch):
|
||||||
"""Test that Qwen2.5-VL vision submodules are compiled.
|
"""Test that Qwen2.5-VL vision submodules are compiled.
|
||||||
|
|
||||||
|
|||||||
@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache):
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
scheduler_config=SchedulerConfig(
|
scheduler_config=SchedulerConfig(
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
|
max_model_len=8192,
|
||||||
|
is_encoder_decoder=False,
|
||||||
),
|
),
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges():
|
|||||||
VllmConfig(
|
VllmConfig(
|
||||||
scheduler_config=SchedulerConfig(
|
scheduler_config=SchedulerConfig(
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
|
max_model_len=8192,
|
||||||
|
is_encoder_decoder=False,
|
||||||
),
|
),
|
||||||
compilation_config=compilation_config,
|
compilation_config=compilation_config,
|
||||||
)
|
)
|
||||||
@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
|
|||||||
)
|
)
|
||||||
scheduler_config = SchedulerConfig(
|
scheduler_config = SchedulerConfig(
|
||||||
max_num_batched_tokens=8192,
|
max_num_batched_tokens=8192,
|
||||||
|
max_model_len=8192,
|
||||||
|
is_encoder_decoder=False,
|
||||||
)
|
)
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
|
|||||||
@ -5,9 +5,14 @@ import copy
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
from vllm.compilation.inductor_pass import (
|
||||||
|
CallableInductorPass,
|
||||||
|
InductorPass,
|
||||||
|
pass_context,
|
||||||
|
)
|
||||||
from vllm.compilation.pass_manager import PostGradPassManager
|
from vllm.compilation.pass_manager import PostGradPassManager
|
||||||
from vllm.config import ModelConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
|
from vllm.config.utils import Range
|
||||||
|
|
||||||
|
|
||||||
# dummy custom pass that doesn't inherit
|
# dummy custom pass that doesn't inherit
|
||||||
@ -42,35 +47,37 @@ class ProperPass(InductorPass):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_pass_manager_uuid(callable):
|
def test_pass_manager_uuid(callable):
|
||||||
# Some passes need dtype to be set
|
# Set the pass context as PassManager uuid uses it
|
||||||
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
with pass_context(Range(start=1, end=8)):
|
||||||
|
# Some passes need dtype to be set
|
||||||
|
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||||
|
|
||||||
pass_manager = PostGradPassManager()
|
pass_manager = PostGradPassManager()
|
||||||
pass_manager.configure(config)
|
pass_manager.configure(config)
|
||||||
|
|
||||||
# Check that UUID is different if the same pass is added 2x
|
# Check that UUID is different if the same pass is added 2x
|
||||||
pass_manager.add(callable)
|
pass_manager.add(callable)
|
||||||
uuid1 = pass_manager.uuid()
|
uuid1 = pass_manager.uuid()
|
||||||
pass_manager.add(callable)
|
pass_manager.add(callable)
|
||||||
uuid2 = pass_manager.uuid()
|
uuid2 = pass_manager.uuid()
|
||||||
assert uuid1 != uuid2
|
assert uuid1 != uuid2
|
||||||
|
|
||||||
# UUID should be the same as the original one,
|
# UUID should be the same as the original one,
|
||||||
# as we constructed in the same way.
|
# as we constructed in the same way.
|
||||||
pass_manager2 = PostGradPassManager()
|
pass_manager2 = PostGradPassManager()
|
||||||
pass_manager2.configure(config)
|
pass_manager2.configure(config)
|
||||||
pass_manager2.add(callable)
|
pass_manager2.add(callable)
|
||||||
assert uuid1 == pass_manager2.uuid()
|
assert uuid1 == pass_manager2.uuid()
|
||||||
|
|
||||||
# UUID should be different due to config change
|
# UUID should be different due to config change
|
||||||
config2 = copy.deepcopy(config)
|
config2 = copy.deepcopy(config)
|
||||||
config2.compilation_config.pass_config.fuse_norm_quant = (
|
config2.compilation_config.pass_config.fuse_norm_quant = (
|
||||||
not config2.compilation_config.pass_config.fuse_norm_quant
|
not config2.compilation_config.pass_config.fuse_norm_quant
|
||||||
)
|
)
|
||||||
config2.compilation_config.pass_config.fuse_act_quant = (
|
config2.compilation_config.pass_config.fuse_act_quant = (
|
||||||
not config2.compilation_config.pass_config.fuse_act_quant
|
not config2.compilation_config.pass_config.fuse_act_quant
|
||||||
)
|
)
|
||||||
pass_manager3 = PostGradPassManager()
|
pass_manager3 = PostGradPassManager()
|
||||||
pass_manager3.configure(config2)
|
pass_manager3.configure(config2)
|
||||||
pass_manager3.add(callable)
|
pass_manager3.add(callable)
|
||||||
assert uuid1 != pass_manager3.uuid()
|
assert uuid1 != pass_manager3.uuid()
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import hashlib
|
import hashlib
|
||||||
import inspect
|
import inspect
|
||||||
@ -8,15 +10,17 @@ import json
|
|||||||
import types
|
import types
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from typing import Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import fx
|
from torch import fx
|
||||||
from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
|
from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily
|
||||||
|
|
||||||
from vllm.config.utils import Range
|
|
||||||
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
from vllm.utils.torch_utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from vllm.config.utils import Range
|
||||||
|
|
||||||
if is_torch_equal_or_newer("2.6"):
|
if is_torch_equal_or_newer("2.6"):
|
||||||
from torch._inductor.custom_graph_pass import CustomGraphPass
|
from torch._inductor.custom_graph_pass import CustomGraphPass
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -53,8 +53,27 @@ class PiecewiseBackend:
|
|||||||
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
|
self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
|
||||||
|
|
||||||
self.is_full_graph = total_piecewise_compiles == 1
|
self.is_full_graph = total_piecewise_compiles == 1
|
||||||
|
# TODO: we need to generalize encoder compilation to other models
|
||||||
|
self.is_encoder_compilation = vllm_backend.prefix in [
|
||||||
|
"Qwen2_5_VisionPatchEmbed",
|
||||||
|
"Qwen2_5_VisionPatchMerger",
|
||||||
|
"Qwen2_5_VisionBlock",
|
||||||
|
]
|
||||||
|
|
||||||
self.compile_ranges = self.compilation_config.get_compile_ranges()
|
self.compile_ranges = self.compilation_config.get_compile_ranges()
|
||||||
|
if self.is_encoder_compilation:
|
||||||
|
# For encoder compilation we use the max int32 value
|
||||||
|
# to set the upper bound of the compile ranges
|
||||||
|
max_int32 = 2**31 - 1
|
||||||
|
last_compile_range = self.compile_ranges[-1]
|
||||||
|
assert (
|
||||||
|
last_compile_range.end
|
||||||
|
== vllm_config.scheduler_config.max_num_batched_tokens
|
||||||
|
)
|
||||||
|
self.compile_ranges[-1] = Range(
|
||||||
|
start=last_compile_range.start, end=max_int32
|
||||||
|
)
|
||||||
|
|
||||||
log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
|
log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
|
||||||
logger.debug_once(log_string)
|
logger.debug_once(log_string)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user