diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 7927aef19e4e..7e0f720feaa7 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -76,7 +76,7 @@ function cpu_tests() {
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   pytest -x -s -v \
   #     tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index d5b1b4ad29a9..6a533eb5c937 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -4,8 +4,7 @@
 This file demonstrates the usage of text generation with an LLM model,
 comparing the performance with and without speculative decoding.
 
-Note that still not support `v1`:
-VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+Note that this example is out of date and not supported in vLLM v1.
 """
 
 import gc
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index 16d44cbadbc9..d8fb50d7fe55 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
 
 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q use_audio_in_video
 
 # Multiple audios
-VLLM_USE_V1=0 \
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q multi_audios
 ```
diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index 6fbe1303f431..ed005e6a69b8 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
 
 from typing import NamedTuple
 
-import vllm.envs as envs
 from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
     )
     asset = VideoAsset(name="baby_reading", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
-    assert not envs.VLLM_USE_V1, (
-        "V1 does not support use_audio_in_video. "
-        "Please launch this example with "
-        "`VLLM_USE_V1=0`."
-    )
+
     return QueryResult(
         inputs={
             "prompt": prompt,
diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py
index e10ee4e2a9a9..53036b3eb0ff 100644
--- a/examples/others/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -37,7 +37,7 @@
 from vllm.config import KVTransferConfig
 from vllm.engine.arg_utils import EngineArgs
 
-def setup_environment_variables(vllm_version: str):
+def setup_environment_variables():
     # LMCache-related environment variables
     # Use experimental features in LMCache
     os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
     os.environ["LMCACHE_LOCAL_CPU"] = "True"
     # Set local CPU memory limit to 5.0 GB
     os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-    if vllm_version == "v0":
-        os.environ["VLLM_USE_V1"] = "0"
 
 
 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str):
     ktc = KVTransferConfig(
         kv_connector=lmcache_connector,
         kv_role="kv_both",
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    if vllm_version == "v0":
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-            enable_chunked_prefill=True,  # Only in v0
-        )
-    else:
-        llm_args = EngineArgs(
-            model=model,
-            kv_transfer_config=ktc,
-            max_model_len=8000,
-            gpu_memory_utilization=0.8,
-        )
+    llm_args = EngineArgs(
+        model=model,
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+    )
 
     llm = LLM(**asdict(llm_args))
     try:
@@ -116,18 +105,10 @@ def parse_args():
 
 
 def main():
-    args = parse_args()
-
-    if args.version == "v0":
-        lmcache_connector = "LMCacheConnector"
-        model = "mistralai/Mistral-7B-Instruct-v0.2"
-    else:
-        lmcache_connector = "LMCacheConnectorV1"
-        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-
-    setup_environment_variables(args.version)
-
-    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
+    lmcache_connector = "LMCacheConnectorV1"
+    model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    setup_environment_variables()
+    with build_llm_with_lmcache(lmcache_connector, model) as llm:
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
         shared_prompt = "Hello, how are you?" * 1000
diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py
index d32cfde07c21..1ed44a33bf81 100644
--- a/tests/entrypoints/openai/test_orca_metrics.py
+++ b/tests/entrypoints/openai/test_orca_metrics.py
@@ -22,9 +22,6 @@ def monkeypatch_module():
 
 @pytest.fixture(scope="module", params=[True])
 def server(request, monkeypatch_module):
-    use_v1 = request.param
-    monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-
     args = [
         "--dtype",
         "bfloat16",
diff --git a/vllm/envs.py b/vllm/envs.py
index 46725efac70e..2aa6afcabf28 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -100,7 +100,6 @@ if TYPE_CHECKING:
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
    VLLM_DISABLE_PYNCCL: bool = False
-    VLLM_USE_V1: bool = True
     VLLM_ROCM_USE_AITER: bool = False
     VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
     ),
-    # If set, use the V1 code path.
-    "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
     # Disable aiter ops unless specifically enabled.
     # Acts as a parent switch to enable the rest of the other operations.
     "VLLM_ROCM_USE_AITER": lambda: (
@@ -1538,16 +1535,6 @@ def is_set(name: str):
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
-def set_vllm_use_v1(use_v1: bool):
-    if is_set("VLLM_USE_V1"):
-        raise ValueError(
-            "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
-            "explicitly by the user. Please raise this as a Github "
-            "Issue and explicitly set VLLM_USE_V1=0 or 1."
-        )
-    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
-
-
 def compute_hash() -> str:
     """
     WARNING: Whenever a new key is added to this environment
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index c8bff8b7c80b..4eddaf56d81a 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
     "VLLM_USE_FLASHINFER_SAMPLER",
     "VLLM_PP_LAYER_PARTITION",
     "VLLM_USE_TRITON_AWQ",
-    "VLLM_USE_V1",
     "VLLM_ENABLE_V1_MULTIPROCESSING",
 ]
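
For reference, a minimal sketch (not part of the patch) of the V1-only LMCache CPU-offload flow that `examples/others/lmcache/cpu_offload_lmcache.py` reduces to after these changes. The connector name, model, `EngineArgs` fields, and `LMCACHE_*` settings mirror the diff above; the prompt and sampling parameters are illustrative assumptions.

```python
# Minimal sketch, assuming vLLM (V1 engine) and LMCache are installed.
# Connector, model, EngineArgs fields, and LMCACHE_* settings follow the
# simplified example in this patch; the prompt and SamplingParams below
# are illustrative only.
import os
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs

# LMCache: experimental features on, CPU offload with a 5.0 GB local limit.
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_LOCAL_CPU"] = "True"
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
# Note: VLLM_USE_V1 is no longer set anywhere; V1 is the only code path.

ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
llm_args = EngineArgs(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    kv_transfer_config=ktc,
    max_model_len=8000,
    gpu_memory_utilization=0.8,
)
llm = LLM(**asdict(llm_args))

outputs = llm.generate(
    ["Hello, how are you?" * 1000],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```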