Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
commit bc997c18ca (parent d55c6010ac)
@@ -20,7 +20,6 @@ class TestSetting:
     tp_size: int
     attn_backend: str
     method: str
-    fullgraph: bool


 # we cannot afford testing the full Cartesian product
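For orientation, a sketch of the dataclass as it stands after this change. The model and pp_size fields are assumptions inferred from the rest of the diff (a model= kwarg and a pp_size * tp_size guard both appear in later hunks):

from dataclasses import dataclass

@dataclass
class TestSetting:
    model: str    # assumed: set via model=... in the parametrize list below
    pp_size: int  # assumed: used in the pp_size * tp_size device guard below
    tp_size: int
    attn_backend: str
    method: str
    # fullgraph: bool  -- removed; fullgraph capture is now unconditional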
@@ -36,7 +35,6 @@ class TestSetting:
             tp_size=2,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # llama model with quantization
         TestSetting(
@@ -46,7 +44,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # MoE model
         TestSetting(
@@ -56,7 +53,6 @@ class TestSetting:
             tp_size=2,
             attn_backend="FLASH_ATTN",
             method="generate",
-            fullgraph=True,
         ),
         # embedding model
         TestSetting(
@@ -73,7 +69,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="encode",
-            fullgraph=True,
         ),
         TestSetting(
             model="BAAI/bge-base-en-v1.5",
@@ -82,7 +77,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="encode",
-            fullgraph=True,
         ),
         # vision language model
         TestSetting(
@@ -92,7 +86,6 @@ class TestSetting:
             tp_size=1,
             attn_backend="FLASH_ATTN",
             method="generate_with_image",
-            fullgraph=False,
         ),
     ],
 )
@@ -109,9 +102,8 @@ def test_compile_correctness(
     tp_size = test_setting.tp_size
     attn_backend = test_setting.attn_backend
     method = test_setting.method
-    fullgraph = test_setting.fullgraph
-    if cuda_device_count_stateless() != pp_size * tp_size:
-        pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got "
+    if cuda_device_count_stateless() < pp_size * tp_size:
+        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
                     f"{cuda_device_count_stateless()}")

     with monkeypatch.context() as m:
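Besides dropping the fullgraph field, the hunk above relaxes the device guard from an exact match (!=, "Need exactly") to a lower bound (<, "Need at least"). A standalone sketch of the new guard; require_gpus is a hypothetical helper, not vllm API:

import pytest

def require_gpus(available: int, pp_size: int, tp_size: int) -> None:
    # any host with at least pp_size * tp_size devices now runs the test;
    # the old guard skipped unless the count matched exactly
    if available < pp_size * tp_size:
        pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got "
                    f"{available}")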
@@ -149,9 +141,5 @@ def test_compile_correctness(
         ]:
             all_args.append(final_args + [f"-O{level}"])
             all_envs.append({})
-            if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
-                # "DYNAMO_ONCE" will always use fullgraph
-                all_envs[-1][
-                    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"  # type: ignore

         compare_all_settings(model, all_args * 3, all_envs, method=method)
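For context, a sketch of the -O matrix this loop builds now that the per-level env override is gone. CompilationLevel here is a stand-in with assumed values mirroring vllm.config.CompilationLevel, and final_args is hypothetical:

class CompilationLevel:  # stand-in; values assumed
    NO_COMPILATION = 0
    DYNAMO_AS_IS = 1
    DYNAMO_ONCE = 2
    PIECEWISE = 3

final_args = ["--enforce-eager"]  # hypothetical base CLI args
all_args, all_envs = [], []
for level in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_ONCE]:
    all_args.append(final_args + [f"-O{level}"])
    all_envs.append({})  # no VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE override anymore
print(all_args)  # [['--enforce-eager', '-O0'], ['--enforce-eager', '-O2']]
print(all_envs)  # [{}, {}]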
@@ -79,9 +79,7 @@ def test_full_graph(
 ):
     model, model_kwargs = model_info

-    with monkeypatch.context() as m:
-        # make sure these models can be captured in full graph mode
-        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+    with monkeypatch.context():
         print(f"MODEL={model}")

         run_model(optimization_level, model, model_kwargs)
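With the setenv call removed, the context manager no longer sets the fullgraph flag here. For reference, a self-contained sketch of the pattern the old code relied on (pytest's real monkeypatch API):

import os

def test_env_scoping(monkeypatch):
    # monkeypatch.context() scopes overrides to the with-block and
    # restores the environment on exit
    with monkeypatch.context() as m:
        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
        assert os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] == "1"
    # assuming the variable was not already set before the test
    assert "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE" not in os.environ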
@@ -10,7 +10,6 @@ from typing import Callable, Optional

 import torch

-import vllm.envs as envs
 from vllm.config import (CompilationLevel, CUDAGraphMode,
                          get_current_vllm_config)
 from vllm.logger import init_logger
@@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher:
             options = get_current_vllm_config(
             ).compilation_config.inductor_compile_config

-        compiled_callable = torch.compile(
-            self.forward,
-            fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-            backend=backend,
-            options=options)
+        compiled_callable = torch.compile(self.forward,
+                                          fullgraph=True,
+                                          backend=backend,
+                                          options=options)

         self.compiled_callable = compiled_callable
         self.original_code_object = self.__class__.forward.__code__
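fullgraph=True is now hard-coded where the env var used to decide. A minimal self-contained illustration of what the flag does (standard torch.compile API; the toy function is hypothetical):

import torch

def f(x: torch.Tensor) -> torch.Tensor:
    return torch.relu(x) + 1.0

# with fullgraph=True, Dynamo raises on any graph break instead of
# silently splitting the function into multiple compiled regions
compiled = torch.compile(f, fullgraph=True, backend="inductor")
print(compiled(torch.randn(4)))

In the wrapper above, the options argument carries compilation_config.inductor_compile_config through to the Inductor backend.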
@@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_FLASH_ATTN_VERSION":
     lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)),

-    # Internal flag to enable Dynamo fullgraph capture
-    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
-    lambda: bool(
-        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
-
     # Feature flag to enable/disable Inductor standalone compile.
     # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is
     # enabled by default.
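The deleted entry followed the usual envs.py pattern: a name mapped to a zero-argument lambda that reads os.environ lazily on access. A self-contained sketch reproducing the removed entry:

import os

environment_variables = {
    "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
    lambda: bool(
        os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"),
}
# True with the variable unset, since the default "1" is not "0"
print(environment_variables["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"]())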
@@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             backend = self.vllm_config.compilation_config.init_backend(
                 self.vllm_config)
             compilation_counter.dynamo_as_is_count += 1
-            self.model.compile(
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+            self.model.compile(fullgraph=True, backend=backend)
             return
         # for other compilation levels, cudagraph behavior is controlled by
         # CudagraphWraper and CudagraphDispatcher of vllm.
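Unlike the self.model = torch.compile(self.model, ...) reassignment in the next hunk, self.model.compile(...) above compiles the module in place. A small sketch of the distinction (standard PyTorch API, torch >= 2.1 assumed):

import torch
import torch.nn as nn

model = nn.Linear(8, 8)
# nn.Module.compile() compiles the module in place, so callers keep their
# existing reference; torch.compile(model) instead returns a new
# OptimizedModule wrapper that must be reassigned
model.compile(fullgraph=True, backend="inductor")
print(model(torch.randn(2, 8)).shape)  # torch.Size([2, 8])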
@@ -18,7 +18,6 @@ import torch.distributed
 import torch.nn as nn
 from tqdm.auto import tqdm

-import vllm.envs as envs
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.attention.backends.abstract import AttentionState
 from vllm.attention.backends.utils import CommonAttentionState
@@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
             backend = self.vllm_config.compilation_config.init_backend(
                 self.vllm_config)
             compilation_counter.dynamo_as_is_count += 1
-            self.model = torch.compile(
-                self.model,
-                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
-                backend=backend)
+            self.model = torch.compile(self.model,
+                                       fullgraph=True,
+                                       backend=backend)

     def get_model(self) -> nn.Module:
         return self.model