[V0 Deprecation] Remove LLMEngine (#25033)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Woosuk Kwon 2025-09-20 17:56:30 -07:00 committed by GitHub
parent 367a480bd3
commit 52c2a8d4ad
29 changed files with 65 additions and 2763 deletions
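This commit removes the V0 `vllm.engine.llm_engine.LLMEngine`, leaving the V1 engine at `vllm.v1.engine.llm_engine.LLMEngine` as the only implementation. A rough migration sketch, inferred from the import and call changes in the updated tests below (not part of the diff itself; the model name and request values are placeholders):

```python
# Hypothetical usage sketch based on the import changes in this commit;
# vllm.engine.llm_engine no longer exists, so the V1 path is imported instead.
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.llm_engine import LLMEngine  # V1 engine (now the only one)

engine_args = EngineArgs(
    model="facebook/opt-125m",  # placeholder model, as used in the tests
    enforce_eager=True,         # reduce startup time
)
engine = LLMEngine.from_engine_args(engine_args=engine_args)

# Submit a request and run one engine step, mirroring the updated plugin test.
engine.add_request("0", "foo", SamplingParams(max_tokens=1))
outputs = engine.step()
```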


@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi


@@ -110,7 +110,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
timeout_in_minutes: 130
@@ -163,7 +163,6 @@ steps:
- tests/v1/engine/test_engine_core_client.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -314,12 +313,11 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@@ -894,7 +892,7 @@ steps:
- pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py

.github/CODEOWNERS

@@ -5,7 +5,6 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill


@@ -1,510 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from collections.abc import Generator
from dataclasses import asdict, dataclass
from typing import Any, Optional, TypeAlias
import torch
import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256
@dataclass
class ProfileContext:
engine_args: EngineArgs
prompt_len: int
batch_size: int
# The profiler can run in 2 modes,
# 1. Run profiler for user specified num_steps
num_steps: Optional[int] = None
# 2. Run profiler until all requests complete
complete_num_requests_per_step: Optional[int] = None
save_chrome_traces_folder: Optional[str] = None
def get_dtype(dtype: str):
if dtype == "torch.float":
return torch.float
else:
return dtype
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
def compute_request_output_lengths(
batch_size: int, step_requests: list[int]
) -> OutputLen_NumReqs_Map:
"""
Given the number of requests, batch_size, and the number of requests
that each engine-step should process, step_requests, determine the
output lengths of the requests such that step_request is honoured.
Example:
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
then return,
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
32 requests should have output length 2,
32 requests should have output length 3,
32 requests should have output length 4,
31 requests should have output length 5,
1 request should have output length 6.
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (list[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
number of requests required to have that output-length as values.
"""
ol_nr: OutputLen_NumReqs_Map = {}
# Number of request that are assigned an output-length
num_reqs_assigned: int = 0
num_steps: int = len(step_requests)
# sanity check. The first step (prefill-step), must process all requests.
assert step_requests[0] == batch_size
# Begin assignments from the last step.
output_length: int = num_steps
for num_requests_at_step in reversed(step_requests):
if num_reqs_assigned == batch_size:
break
assert num_reqs_assigned < batch_size
# Remove the number of requests that have been determined
# to participate in this step and beyond.
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
assert num_reqs_unassigned_at_step >= 0
if num_reqs_unassigned_at_step > 0:
ol_nr[output_length] = num_reqs_unassigned_at_step
num_reqs_assigned += num_reqs_unassigned_at_step
output_length -= 1
# sanity checks.
assert sum(ol_nr.values()) == batch_size, (
"Number of requests in output-length assignment does not match "
f"batch-size.\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
# Check that the output-length is in [1, num-steps]. Output length must be
# at least 1 as all requests must participate in the prefill-step.
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
"Output lengths of requests should be in range "
f"[1, num-engine-steps].\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
return ol_nr
def determine_requests_per_step(context: ProfileContext) -> list[int]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
same number of requests and the output list is of length
context.num_steps.
If context.complete_num_requests_per_step is set, then each decode step
processes fewer and fewer requests until there are no requests to process.
In this case, the output list is as big as the number of steps
required to process all requests.
Args:
context: ProfileContext object.
Returns:
list[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
if context.num_steps:
# All requests must run until num_engine_steps. This implies
# that their output lengths must be equal to num_engine_steps.
return [context.batch_size] * context.num_steps
assert (
context.complete_num_requests_per_step
and context.complete_num_requests_per_step > 0
), (
f"Expected a positive complete_num_requests_per_step argument."
f"Instead got {context.complete_num_requests_per_step}"
)
# We start dropping after the first decode step.
step_requests = [
context.batch_size, # prefill
context.batch_size, # decode
]
num_running_requests = context.batch_size
num_running_requests -= context.complete_num_requests_per_step
while num_running_requests > 0:
step_requests.append(num_running_requests)
num_running_requests -= context.complete_num_requests_per_step
if step_requests[-1] != 1:
# have 1 request running at the last step. This is often
# useful
step_requests.append(1)
return step_requests
def run_profile(
context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
):
print("Run profile with:")
for key, value in asdict(context).items():
print(f" {key} = {value}")
requests_per_step: list[int] = determine_requests_per_step(context)
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
context.batch_size, requests_per_step
)
num_steps_to_profile: int = len(requests_per_step)
max_output_len: int = max(ol_nr.keys())
assert max_output_len >= 1
# Create sampling params
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
# max_tokens is set on a per-request basis.
max_tokens=None,
ignore_eos=True,
)
# Create LLM
llm = LLM(**asdict(context.engine_args))
batch_size = context.batch_size
prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
if batch_size * prompt_len > max_num_batched_tokens:
print(
f"ERROR: chosen batch_size * prompt_len "
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
f"and therefore cannot be run in a single profile step, please "
f"choose a smaller batch size or prompt length, or increase "
f"--max-num-batched-tokens"
)
sys.exit(-1)
if batch_size > max_num_seqs:
print(
f"ERROR: chosen batch_size ({batch_size}) is larger than "
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
f"single profile step, please choose a smaller batch size"
)
sys.exit(-1)
print(
"llm.llm_engine.model_config.max_model_len: ",
llm.llm_engine.model_config.max_model_len,
)
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
print(
f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
f"than the model's max_model_len ({max_model_len}), please "
f"choose a smaller prompt_len or max_output_len, or increase "
f"--max-model-len"
)
sys.exit(-1)
def add_requests():
def get_output_len_generator() -> Generator[int, Any, Any]:
for output_len, num_reqs in ol_nr.items():
for _ in range(num_reqs):
yield output_len
output_len_generator = get_output_len_generator()
for i in range(batch_size):
sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint(
llm.get_tokenizer().vocab_size, size=(prompt_len,)
).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
prompt={"prompt_token_ids": prompt_token_ids},
params=sampling_params,
)
def abort_requests():
for i in range(batch_size):
llm.llm_engine.abort_request(f"seq{i}")
# Warm up run
print("Warm up run ...")
add_requests()
llm.llm_engine.step() # Prefill
llm.llm_engine.step() # Decode
abort_requests()
print("Profile run ...")
add_requests()
with layerwise_profile() as prefill_prof:
llm.llm_engine.step() # First step is prefill
decode_profs = []
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
llm.llm_engine.step()
decode_profs.append(decode_prof)
decode_results_list = [prof.results for prof in decode_profs]
prefill_results = prefill_prof.results
has_decode = len(decode_results_list) > 0
LINE_WIDTH = 80
print("=" * LINE_WIDTH)
print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_model_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_model_table()
print()
print("=" * LINE_WIDTH)
print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_summary_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_summary_table()
if csv_output:
csv_filename_base = (
csv_output[:-4] if csv_output.endswith(".csv") else csv_output
)
prefill_results.export_model_stats_table_csv(
csv_filename_base + "_prefill_model_table.csv"
)
prefill_results.export_summary_stats_table_csv(
csv_filename_base + "_prefill_summary_table.csv"
)
if has_decode:
decode_results_list[0].export_model_stats_table_csv(
csv_filename_base + "_decode_model_table.csv"
)
decode_results_list[0].export_summary_stats_table_csv(
csv_filename_base + "_decode_summary_table.csv"
)
if json_output:
cuda_devices = [
torch.cuda.get_device_properties(dev_idx)
for dev_idx in range(torch.cuda.device_count())
]
json_dict = {
"context": {
"python_version": f"{sys.version}",
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
**asdict(context),
},
"prefill": prefill_results.convert_stats_to_dict(),
}
if has_decode:
for idx, dr in enumerate(decode_results_list):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
# Add .json to json_output filename if it doesn't exist already.
json_output_file = (
json_output if json_output.endswith(".json") else json_output + ".json"
)
with open(json_output_file, "w+") as f:
json.dump(json_dict, f, indent=2)
pass
if context.save_chrome_traces_folder is not None:
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
prefill_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + "/prefill.json"
)
for idx, decode_prof in enumerate(decode_profs):
decode_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
)
print(
"Traces saved as prefill.json and decode_1.json, etc."
f" in folder {context.save_chrome_traces_folder}"
)
def parse_args():
parser = FlexibleArgumentParser(
description="""
Profile a model
example:
```
python examples/offline_inference/profiling.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
```
then you can use various tools to analyze the json output
terminal ascii tables:
```
python tools/profiler/print_layerwise_table.py \\
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
```
or create matplotlib stacked bar charts:
```
python tools/profiler/visualize_layerwise_profile.py \\
--json-trace Llama31-8b-FP8.json \\
--output-directory profile_breakdown --plot-metric pct_cuda_time
```
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--csv",
type=str,
default=None,
help="Export the results as multiple csv file. This should be the root "
"filename, will create <filename>_prefill_model_table.csv, "
"<filename>_prefill_summary_table.csv, "
"<filename>_decode_model_table.csv, and "
"<filename>_decode_summary_table.csv",
)
parser.add_argument(
"--json",
type=str,
default=None,
help="Export the results as a json file. This should be the filename",
)
parser.add_argument(
"--save-chrome-traces-folder",
type=str,
help="Save chrome traces for the prefill and decode "
"will save traces as prefill.json and decode_1.json, "
"etc. inside this folder",
)
parser.add_argument(
"--prompt-len",
type=int,
default=PROMPT_LEN_DEFAULT,
help=f"Length of the random prompt to use when profiling, all batched "
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
)
parser.add_argument(
"--batch-size",
type=int,
default=BATCH_SIZE_DEFAULT,
help=f"Number of requests to run as a single batch, "
f"default={BATCH_SIZE_DEFAULT}",
)
subparsers = parser.add_subparsers(dest="cmd")
run_num_steps_parser = subparsers.add_parser(
"run_num_steps", help="This variation profiles n engine.step() invocations."
)
run_num_steps_parser.add_argument(
"-n",
"--num-steps",
type=int,
help="Number of engine steps to profile.\n"
"Setting it to 1, profiles only the prefill step.\n"
"Setting it to 2, profiles the prefill and first decode step\n"
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
"and so on ...",
)
run_to_completion_parser = subparsers.add_parser(
"run_to_completion",
help="This variation profiles all the engine.step() invocations"
"until the engine exhausts all submitted requests.",
)
run_to_completion_parser.add_argument(
"-n",
"--complete-num-requests-per-step",
type=int,
help="Complete complete_num_requests_per_step requests every decode step."
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
"the profiler is run for 6 engine steps, with the steps processing, "
"128, 128, 96, 64, 32, 1 requests respectively.\n"
"Note that we tack-on a one-request step at the end as it is often "
"useful.",
)
EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args):
context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args),
**{
k: v
for k, v in vars(args).items()
if k in inspect.signature(ProfileContext).parameters
},
)
run_profile(context, csv_output=args.csv, json_output=args.json)
if __name__ == "__main__":
args = parse_args()
main(args)


@@ -11,7 +11,7 @@ from unittest.mock import Mock
import pytest
import torch
from vllm import LLM, envs
from vllm import LLM
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from ..conftest import HfRunner, VllmRunner
@@ -26,14 +26,6 @@ MODELS = [
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("distilbert/distilgpt2")
@@ -76,12 +68,6 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
if not envs.VLLM_USE_V1:
if async_scheduling:
pytest.skip("async_scheduling only supported in v1.")
if model_executor != "uni":
pytest.skip("only test uniproc executor for v0.")
if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
pytest.skip(
f"{backend} does not support gemma2 with full context length.")


@@ -122,11 +122,12 @@ def test_cumem_with_cudagraph():
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
("facebook/opt-125m", True),
])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
assert use_v1
m.setenv("VLLM_USE_V1", "1")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)


@@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
# Use global backends
global backend, backend_unfused
use_v1 = False # can be made a param once V1 support added
monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
# Prompt 4 seems too open-ended, differs between fused and unfused


@@ -160,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch):
monkeypatch.delenv("VLLM_USE_V1")
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v0 = request.node.get_closest_marker("skip_v0")
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
if skip_v0:
pytest.skip("Skipping test on vllm V0")
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
@pytest.fixture(autouse=True)
def init_test_http_connection():
# pytest_asyncio may use a different event loop per test


@@ -25,12 +25,6 @@ TOKEN_IDS = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to


@@ -6,14 +6,6 @@ import pytest
from vllm import LLM
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def test_empty_prompt():
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):


@@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
"--port",
port,
],
env={"VLLM_USE_V1": "1" if use_v1 else "0"})
env={"VLLM_USE_V1": "1"})
def is_server_up(url):
try:


@@ -69,28 +69,20 @@ def generate_params():
@pytest.mark.parametrize("device, name, use_mla, block_size",
generate_params())
@pytest.mark.parametrize("use_v1", [True, False])
def test_env(
device: str,
name: str,
use_mla: bool,
block_size: int,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
if name == "FLASHINFER" and not use_v1:
pytest.skip("FlashInfer backend is only available on V1 engine")
if device == "cpu":
if not use_v1:
pytest.skip("CPU backend only supports V1")
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size,
@@ -137,7 +129,7 @@ def test_env(
block_size,
False,
use_mla=use_mla)
expected = f"{name}_VLLM_V1" if use_v1 else name
expected = f"{name}_VLLM_V1"
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
@@ -146,7 +138,7 @@ def test_env(
block_size,
False,
use_mla=use_mla)
expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
expected = "TRITON_ATTN_VLLM_V1"
assert backend.get_name() == expected
elif device == "cuda":
@@ -163,11 +155,7 @@ def test_env(
# - TRITON_MLA: fallback for other cases
if name == "CUTLASS_MLA":
if not use_v1:
# CUTLASS_MLA only supported on V1 engine
pytest.skip(
"CUTLASS_MLA only supported on V1 engine")
elif block_size != 128:
if block_size != 128:
# CUTLASS_MLA only supports block_size == 128
pytest.skip(
"CUTLASS_MLA only supports block_size 128")
@@ -181,11 +169,7 @@ def test_env(
expected = "CUTLASS_MLA_VLLM_V1"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if not use_v1:
# FlashInfer MLA only supported on V1 engine
pytest.skip(
"FlashInfer MLA only supported on V1 engine")
elif block_size not in [32, 64]:
if block_size not in [32, 64]:
# FlashInfer MLA only supports block_size 32 or 64
pytest.skip(
"FlashInfer MLA only supports block_size 32 "
@@ -217,23 +201,17 @@ def test_env(
block_size,
False,
use_mla=use_mla)
expected = f"{name}_VLLM_V1" if use_v1 else name
expected = f"{name}_VLLM_V1"
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
if not use_v1:
# FlashAttention MLA only supported on V1 engine
pytest.skip(
"FlashAttention MLA only supported on V1 engine"
)
else:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "FLASH_ATTN_MLA"
assert backend.get_name() == expected
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
expected = "FLASH_ATTN_MLA"
assert backend.get_name() == expected
else:
# TRITON_MLA or other fallback
backend = get_attn_backend(16,
@@ -242,8 +220,7 @@ def test_env(
block_size,
False,
use_mla=use_mla)
expected = ("TRITON_MLA_VLLM_V1"
if use_v1 else "TRITON_MLA")
expected = "TRITON_MLA_VLLM_V1"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
@@ -252,7 +229,7 @@ def test_env(
block_size,
False,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1" if use_v1 else name
expected = "FLASHINFER_VLLM_V1"
assert backend.get_name() == expected
else:
backend = get_attn_backend(32,
@@ -261,36 +238,30 @@ def test_env(
block_size,
False,
use_mla=use_mla)
expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
expected = "FLASH_ATTN_VLLM_V1"
assert backend.get_name() == expected
if use_v1:
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
backend = get_attn_backend(16,
torch.float16,
None,
block_size,
False,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("use_v1", [True, False])
def test_fp32_fallback(
device: str,
use_v1: bool,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv("VLLM_USE_V1", "1")
if device == "cpu":
if not use_v1:
pytest.skip("CPU backend only supports V1")
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16, False)
@@ -300,8 +271,7 @@ def test_fp32_fallback(
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16, False)
assert (backend.get_name() == "FLEX_ATTENTION"
if use_v1 else "XFORMERS")
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
@@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert backend.get_name() != STR_FLASH_ATTN_VAL
@pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
"""Test that invalid attention backend names raise ValueError."""
with monkeypatch.context() as m, patch(
"vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
# Should raise ValueError for invalid backend


@@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions.
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.lora.request import LoRARequest
from vllm.v1.engine.llm_engine import LLMEngine
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"


@@ -15,7 +15,8 @@ from ...utils import check_logprobs_close
# have a clean way to fall back, so we fail with
# a clear msg when it happens.
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
# NOTE(woosuk): Skipping these tests until V1 supports them.
# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
# This list contains the model that are using AITER kernel.
# Skip model that are not using AITER tests.
@@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
if model in REQUIRES_V0:
monkeypatch.setenv("VLLM_USE_V1", "0")
if use_rocm_aiter and (model in AITER_MODEL_LIST):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
elif use_rocm_aiter and model not in AITER_MODEL_LIST:


@@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from ...utils import check_logprobs_close, check_outputs_equal
from ...utils import check_logprobs_close
# Mark all tests as hybrid
pytestmark = pytest.mark.hybrid_model
@@ -88,15 +88,6 @@ def test_models(
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
if model in V1_SUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
@@ -104,14 +95,6 @@
else:
vllm_v1_outputs = None
if vllm_v0_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
name_0="hf",
name_1="vllm-v0",
)
if model in V1_SUPPORTED_MODELS:
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -157,45 +140,6 @@
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_chunked_prefill(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
num_logprobs: int,
chunked_prefill_token_size: int,
monkeypatch,
) -> None:
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with vllm_runner(model,
enable_chunked_prefill=True,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs) as vllm_model:
chunked = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model,
enable_chunked_prefill=False,
max_num_seqs=max_num_seqs) as vllm_model:
non_chunked = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=chunked,
outputs_1_lst=non_chunked,
name_0="chunked",
name_1="non_chunked",
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(
@@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding(
"Could be related to mamba cache not padded correctly")
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
monkeypatch,
) -> None:
"""
Tests that outputs are identical with and w/o preemptions (recompute).
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=preempt_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="vllm_preepmtions",
name_1="vllm",
)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
vllm_runner,
@@ -386,27 +298,10 @@ def test_full_cuda_graph(
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
if model not in V0_UNSUPPORTED_MODELS:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
vllm_v0_outputs = None
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
if vllm_v0_outputs is not None:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
name_0="hf",
name_1="vllm-v0",
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v1_outputs,
@@ -442,27 +337,12 @@ def test_fp32_cache_state(
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
**{cache_dtype_param: "float32"}) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v0_outputs,
name_0="hf",
name_1="vllm-v0",
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_v1_outputs,


@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import torch
@@ -82,7 +81,7 @@ def test_prm_models(
check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
max_transformers_version="4.53.2")
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
if current_platform.is_cpu():
pytest.skip("CPU only supports V1")
if current_platform.is_rocm():


@@ -36,9 +36,6 @@ from ..utils import check_logprobs_close
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models(
vllm_runner,
example_prompts,
@@ -49,7 +46,6 @@ def test_models(
enforce_eager: bool,
backend: str,
tensor_parallel_size: int,
disable_async_output_proc: bool,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
@@ -74,7 +70,6 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -85,7 +80,6 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -110,9 +104,6 @@ def test_models(
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_cpu_models(
vllm_runner,
example_prompts,
@@ -120,7 +111,6 @@ def test_cpu_models(
base_model: str,
test_model: str,
max_tokens: int,
disable_async_output_proc: bool,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
@@ -138,7 +128,6 @@ def test_cpu_models(
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -148,7 +137,6 @@ def test_cpu_models(
max_model_len=MAX_MODEL_LEN,
dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)


@@ -7,7 +7,6 @@ from unittest.mock import patch
import pytest
from vllm import LLM
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
from vllm.v1.engine.core import EngineCore as V1EngineCore
@@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
False))
# Avoid calling model.forward()
def _initialize_kv_caches_v0(self) -> None:
self.cache_config.num_gpu_blocks = 0
self.cache_config.num_cpu_blocks = 0
def _initialize_kv_caches_v1(self, vllm_config):
kv_cache_specs = self.model_executor.get_kv_cache_specs()
scheduler_kv_cache_config = get_kv_cache_configs(
@@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
_initialize_kv_caches_v0),
patch.object(V1EngineCore, "_initialize_kv_caches",
with (patch.object(V1EngineCore, "_initialize_kv_caches",
_initialize_kv_caches_v1), monkeypatch.context() as m):
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
# NOTE(woosuk): skip the test for V0-only models
return
if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
# Phi4FlashForCausalLM and MotifForCausalLM
# only supports DIFFERENTIAL_FLASH_ATTN backend


@@ -42,6 +42,7 @@ def test_oot_registration_text_generation(
assert rest == ""
@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
@create_new_process_for_each_test()
def test_oot_registration_embedding(
monkeypatch: pytest.MonkeyPatch,


@@ -7,15 +7,6 @@ import torch
from vllm.plugins import load_general_plugins
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')
def test_platform_plugins():
# simulate workload by running an example
import runpy


@@ -3,47 +3,18 @@
import pytest
from vllm.core.scheduler import Scheduler
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.engine.llm_engine import LLMEngine
class DummyV0Scheduler(Scheduler):
def schedule(self):
raise Exception("Exception raised by DummyV0Scheduler")
class DummyV1Scheduler(V1Scheduler):
class DummyV1Scheduler(Scheduler):
def schedule(self):
raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs(
model="facebook/opt-125m",
enforce_eager=True, # reduce test time
scheduler_cls=DummyV0Scheduler,
)
engine = LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)
engine.step()
assert str(
exception_info.value) == "Exception raised by DummyV0Scheduler"
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
@@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
scheduler_cls=DummyV1Scheduler,
)
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
engine = LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)


@@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM
from vllm.assets.audio import AudioAsset
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
# FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256.
# 2. Increase beam_width to 8.


@@ -9,13 +9,6 @@ import pytest
from vllm import SamplingParams
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]


@@ -8,12 +8,6 @@ from vllm import SamplingParams
MODELS = ["distilbert/distilgpt2"]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
"""We can run both engines for this test."""
pass
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(


@@ -352,58 +352,3 @@ def test_decode_prompt_logprobs(complete_sequence: str,
logprobs[token_id + 1].decoded_token
for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
])
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
def test_decode_prompt_logprobs_chunked_prefill(
vllm_runner,
model,
chunked_prefill_token_size: int,
example_prompts,
monkeypatch,
):
# VLLM V1 does not use incremental detokenization for
# prompt logprobs, so this test strategy is irrelevant.
monkeypatch.setenv("VLLM_USE_V1", "0")
max_num_seqs = 256
enable_chunked_prefill = False
max_num_batched_tokens = None
if chunked_prefill_token_size != -1:
enable_chunked_prefill = True
max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
max_num_batched_tokens = chunked_prefill_token_size
with vllm_runner(model,
dtype="half",
max_logprobs=5,
gpu_memory_utilization=0.5,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs) as vllm_model:
vllm_sampling_params = SamplingParams(max_tokens=10,
logprobs=5,
prompt_logprobs=5,
temperature=0.0)
vllm_results = vllm_model.llm.generate(
example_prompts, sampling_params=vllm_sampling_params)
for idx, result in enumerate(vllm_results):
assert result.prompt_logprobs is not None
assert result.prompt_logprobs[0] is None
# Compared detokenized prompts ids to original prompt.
generated_string = ""
for (prompt_token,
prompt_logprobs) in zip(result.prompt_token_ids[1:],
result.prompt_logprobs[1:]):
# prompt_logprobs is a dict of the token_id: logprob
# We select the token_id corresponding to the actual prompt
# Decoded token in the detokenized string corresponding to this
# prompt token.
generated_string += prompt_logprobs[prompt_token].decoded_token
assert generated_string == example_prompts[idx], (
"Detokenized prompt logprobs do not match original prompt")


@@ -1508,14 +1508,6 @@ class EngineArgs:
recommend_to_remove=True)
return False
if self.kv_cache_dtype != "auto":
supported = current_platform.is_kv_cache_dtype_supported(
self.kv_cache_dtype, model_config)
if not supported:
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)
return False
# No Mamba or Encoder-Decoder so far.
if not model_config.is_v1_compatible:
_raise_or_fallback(feature_name=model_config.architectures,

File diff suppressed because it is too large


@@ -11,7 +11,6 @@ from pydantic import ValidationError
from tqdm.auto import tqdm
from typing_extensions import TypeVar
import vllm.envs as envs
from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
BeamSearchSequence,
create_sort_beams_key_function)
@@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType,
StructuredOutputsConfig, TokenizerMode, is_init_field)
from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
PoolerConfig, RunnerOption)
from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
apply_hf_chat_template,
@@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer)
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, Device, as_iter, is_list_of
from vllm.v1.engine.llm_engine import LLMEngine
from vllm.v1.sample.logits_processor import LogitsProcessor
if TYPE_CHECKING:
@@ -309,11 +308,7 @@ class LLM:
self.request_counter = Counter()
self.default_sampling_params: Union[dict[str, Any], None] = None
if envs.VLLM_USE_V1:
supported_tasks = self.llm_engine \
.get_supported_tasks() # type: ignore
else:
supported_tasks = self.llm_engine.model_config.supported_tasks
supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore
logger.info("Supported_tasks: %s", supported_tasks)
@@ -1473,8 +1468,6 @@
Note:
This method is only available with the V1 LLM engine.
"""
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
assert isinstance(self.llm_engine, V1LLMEngine)
return self.llm_engine.get_metrics()
def _validate_and_add_requests(


@@ -672,21 +672,15 @@ def tensorize_vllm_model(engine_args: "EngineArgs",
) as stream:
stream.write(encryption_params.key)
from vllm import LLMEngine
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
assert envs.VLLM_USE_V1
if not envs.VLLM_USE_V1:
engine = LLMEngine.from_engine_args(engine_args)
engine.model_executor.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
else:
engine = V1LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
from vllm.v1.engine.llm_engine import LLMEngine
engine = LLMEngine.from_vllm_config(engine_config)
engine.collective_rpc(
"save_tensorized_model",
kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
)
def tensorize_lora_adapter(lora_path: str,