[V0 deprecation] Remove VLLM_USE_V1 env (#28204)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
wangxiyuan 2025-11-12 09:22:16 +08:00 committed by GitHub
parent 3f770f4427
commit e1710393c4
8 changed files with 15 additions and 59 deletions

View File

@@ -76,7 +76,7 @@ function cpu_tests() {
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"

   # Run multi-lora tests

View File

@@ -4,8 +4,7 @@
 This file demonstrates the usage of text generation with an LLM model,
 comparing the performance with and without speculative decoding.

-Note that still not support `v1`:
-    VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+Note that this example is out of date and not supported in vLLM v1.
 """

 import gc

View File

@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q use_audio_in_video

 # Multiple audios
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q multi_audios
 ```

View File

@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
 from typing import NamedTuple

-import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
     )
     asset = VideoAsset(name="baby_reading", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
-    assert not envs.VLLM_USE_V1, (
-        "V1 does not support use_audio_in_video. "
-        "Please launch this example with "
-        "`VLLM_USE_V1=0`."
-    )
     return QueryResult(
         inputs={
             "prompt": prompt,

View File

@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
 from vllm.engine.arg_utils import EngineArgs


-def setup_environment_variables(vllm_version: str):
+def setup_environment_variables():
     # LMCache-related environment variables
     # Use experimental features in LMCache
     os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
     os.environ["LMCACHE_LOCAL_CPU"] = "True"
     # Set local CPU memory limit to 5.0 GB
     os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-    if vllm_version == "v0":
-        os.environ["VLLM_USE_V1"] = "0"


 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str):
     ktc = KVTransferConfig(
         kv_connector=lmcache_connector,
         kv_role="kv_both",
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    if vllm_version == "v0":
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-            enable_chunked_prefill=True,  # Only in v0
-        )
-    else:
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-        )
+    llm_args = EngineArgs(
+        model=model,
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+    )

     llm = LLM(**asdict(llm_args))
     try:
@@ -116,18 +105,10 @@ def parse_args():


 def main():
-    args = parse_args()
-
-    if args.version == "v0":
-        lmcache_connector = "LMCacheConnector"
-        model = "mistralai/Mistral-7B-Instruct-v0.2"
-    else:
-        lmcache_connector = "LMCacheConnectorV1"
-        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    lmcache_connector = "LMCacheConnectorV1"
+    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"

-    setup_environment_variables(args.version)
+    setup_environment_variables()

-    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
+    with build_llm_with_lmcache(lmcache_connector, model) as llm:
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
         shared_prompt = "Hello, how are you?" * 1000
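For orientation, here is a pared-down, self-contained sketch of the single code path that remains in this example after the v0 branch is dropped. All vLLM and LMCache names come from the hunk above; the sampling parameters, prompt text, and output printing are illustrative additions, not part of the actual example.

```python
import os
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs

# LMCache settings copied from setup_environment_variables() above.
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_LOCAL_CPU"] = "True"
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"

# One engine configuration for everyone; no v0/v1 branching remains.
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
llm_args = EngineArgs(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    kv_transfer_config=ktc,
    max_model_len=8000,
    gpu_memory_utilization=0.8,
)

llm = LLM(**asdict(llm_args))
outputs = llm.generate(["Hello, how are you?"], SamplingParams(max_tokens=32))
for output in outputs:
    print(output.outputs[0].text)
```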

View File

@@ -22,9 +22,6 @@ def monkeypatch_module():
 @pytest.fixture(scope="module", params=[True])
 def server(request, monkeypatch_module):
-    use_v1 = request.param
-    monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-
     args = [
         "--dtype",
         "bfloat16",

View File

@@ -100,7 +100,6 @@ if TYPE_CHECKING:
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
     VLLM_DISABLE_PYNCCL: bool = False
-    VLLM_USE_V1: bool = True
     VLLM_ROCM_USE_AITER: bool = False
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
     ),
-    # If set, use the V1 code path.
-    "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
     # Disable aiter ops unless specifically enabled.
     # Acts as a parent switch to enable the rest of the other operations.
     "VLLM_ROCM_USE_AITER": lambda: (
@@ -1538,16 +1535,6 @@ def is_set(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


-def set_vllm_use_v1(use_v1: bool):
-    if is_set("VLLM_USE_V1"):
-        raise ValueError(
-            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
-            "explicitly by the user. Please raise this as a Github "
-            "Issue and explicitly set VLLM_USE_V1=0 or 1."
-        )
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-
-
 def compute_hash() -> str:
     """
     WARNING: Whenever a new key is added to this environment
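The removed entry followed the same registration pattern the surviving entries in this hunk still use: `environment_variables` maps each variable name to a zero-argument lambda that parses the value lazily when the attribute is read. A minimal standalone sketch of that pattern is shown below; the flag name is hypothetical and the module-level `__getattr__` is an assumption about how `vllm.envs` dispatches reads, inferred from the `AttributeError` line visible in the hunk.

```python
import os
from typing import Any, Callable

# Hypothetical flag; the parsing style mirrors the lambdas shown above
# (compare the removed VLLM_USE_V1 entry and the VLLM_DISABLE_PYNCCL entry).
environment_variables: dict[str, Callable[[], Any]] = {
    "MY_EXAMPLE_FLAG": lambda: bool(int(os.getenv("MY_EXAMPLE_FLAG", "1"))),
}


def __getattr__(name: str) -> Any:
    # Lazy attribute-style access: evaluate the registered lambda when the
    # variable is read; anything unregistered raises AttributeError.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```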

View File

@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
     "VLLM_USE_FLASHINFER_SAMPLER",
     "VLLM_PP_LAYER_PARTITION",
     "VLLM_USE_TRITON_AWQ",
-    "VLLM_USE_V1",
     "VLLM_ENABLE_V1_MULTIPROCESSING",
 ]
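`_USAGE_ENV_VARS_TO_COLLECT` is simply a list of names whose values are opted in to usage reporting; with this change, VLLM_USE_V1 is no longer part of that set. The sketch below is a hypothetical illustration of how such a list could be snapshotted; the actual collection code in vLLM's usage module is not part of this diff.

```python
import os


def collect_env_vars(names: list[str]) -> dict[str, str | None]:
    # Record the raw value of each opted-in environment variable,
    # or None if it is unset in the current process.
    return {name: os.getenv(name) for name in names}


print(collect_env_vars(["VLLM_USE_TRITON_AWQ", "VLLM_ENABLE_V1_MULTIPROCESSING"]))
```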