diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 7f90181048d0f..aa4cc7b35a543 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c42ec4f2503d0..1e7ce6ef0a665 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -110,7 +110,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Entrypoints Integration Test (API Server) # 100min
   timeout_in_minutes: 130
@@ -163,7 +163,6 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with tp=2 and pp=2
   - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -314,12 +313,11 @@ steps:
     - python3 offline_inference/vision_language.py --seed 0
     - python3 offline_inference/vision_language_pooling.py --seed 0
     - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     - python3 offline_inference/basic/classify.py
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
-    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -894,7 +892,7 @@ steps:
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
   # TODO: investigate and fix
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s models/multimodal/generation/test_maverick.py
 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 323675993467a..f58256d38b9d1 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,7 +5,6 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py
deleted file mode 100644
index 392fba8fc5ead..0000000000000
--- a/examples/offline_inference/profiling.py
+++ /dev/null
@@ -1,510 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import inspect
-import json
-import os
-import sys
-from argparse import RawTextHelpFormatter
-from collections.abc import Generator
-from dataclasses import asdict, dataclass
-from typing import Any, Optional, TypeAlias
-
-import torch
-import tqdm
-
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.profiler.layerwise_profile import layerwise_profile
-from vllm.utils import FlexibleArgumentParser
-
-BATCH_SIZE_DEFAULT = 1
-PROMPT_LEN_DEFAULT = 256
-
-
-@dataclass
-class ProfileContext:
-    engine_args: EngineArgs
-    prompt_len: int
-    batch_size: int
-
-    # The profiler can run in 2 modes,
-    # 1. Run profiler for user specified num_steps
-    num_steps: Optional[int] = None
-    # 2. Run profiler until all requests complete
-    complete_num_requests_per_step: Optional[int] = None
-
-    save_chrome_traces_folder: Optional[str] = None
-
-
-def get_dtype(dtype: str):
-    if dtype == "torch.float":
-        return torch.float
-    else:
-        return dtype
-
-
-OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
-
-
-def compute_request_output_lengths(
-    batch_size: int, step_requests: list[int]
-) -> OutputLen_NumReqs_Map:
-    """
-    Given the number of requests, batch_size, and the number of requests
-    that each engine-step should process, step_requests, determine the
-    output lengths of the requests such that step_request is honoured.
-
-    Example:
-    if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
-    then return,
-    {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
-    32 requests should have output length 2,
-    32 requests should have output length 3,
-    32 requests should have output length 4,
-    31 requests should have output length 5,
-    1 request should have output length 6.
-
-    Args:
-        batch_size (int): Number of requests submitted for profile. This is
-            args.batch_size.
-        step_requests (list[int]): step_requests[i] is the number of requests
-            that the ith engine step should process.
-
-    Returns:
-        OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
-            number of requests required to have that output-length as values.
-    """
-    ol_nr: OutputLen_NumReqs_Map = {}
-
-    # Number of request that are assigned an output-length
-    num_reqs_assigned: int = 0
-    num_steps: int = len(step_requests)
-
-    # sanity check. The first step (prefill-step), must process all requests.
-    assert step_requests[0] == batch_size
-
-    # Begin assignments from the last step.
-    output_length: int = num_steps
-    for num_requests_at_step in reversed(step_requests):
-        if num_reqs_assigned == batch_size:
-            break
-
-        assert num_reqs_assigned < batch_size
-
-        # Remove the number of requests that have been determined
-        # to participate in this step and beyond.
-        num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
-        assert num_reqs_unassigned_at_step >= 0
-
-        if num_reqs_unassigned_at_step > 0:
-            ol_nr[output_length] = num_reqs_unassigned_at_step
-            num_reqs_assigned += num_reqs_unassigned_at_step
-
-        output_length -= 1
-
-    # sanity checks.
-    assert sum(ol_nr.values()) == batch_size, (
-        "Number of requests in output-length assignment does not match "
-        f"batch-size.\n batch size {batch_size} - "
-        f"step requests {step_requests} - assignments {ol_nr}"
-    )
-
-    # Check that the output-length is in [1, num-steps]. Output length must be
-    # at least 1 as all requests must participate in the prefill-step.
-    assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
-        "Output lengths of requests should be in range "
-        f"[1, num-engine-steps].\n batch size {batch_size} - "
-        f"step requests {step_requests} - assignments {ol_nr}"
-    )
-
-    return ol_nr
-
-
-def determine_requests_per_step(context: ProfileContext) -> list[int]:
-    """
-    Determine number of requests each engine step should process.
-    If context.num_steps is set, then all engine steps process the
-    same number of requests and the output list is of length
-    context.num_steps.
-
-    If context.complete_num_requests_per_step is set, then each decode step
-    processes fewer and fewer requests until there are no requests to process.
-    In this case, the output list is as big as the number of steps
-    required to process all requests.
-
-    Args:
-        context: ProfileContext object.
-
-    Returns:
-        list[int]: Number of requests to process for all engine-steps.
-         output[i], contains the number of requests that the ith step
-         should process.
-    """
-    if context.num_steps:
-        # All requests must run until num_engine_steps. This implies
-        # that their output lengths must be equal to num_engine_steps.
-        return [context.batch_size] * context.num_steps
-
-    assert (
-        context.complete_num_requests_per_step
-        and context.complete_num_requests_per_step > 0
-    ), (
-        f"Expected a positive complete_num_requests_per_step argument."
-        f"Instead got {context.complete_num_requests_per_step}"
-    )
-
-    # We start dropping after the first decode step.
-    step_requests = [
-        context.batch_size,  # prefill
-        context.batch_size,  # decode
-    ]
-
-    num_running_requests = context.batch_size
-    num_running_requests -= context.complete_num_requests_per_step
-    while num_running_requests > 0:
-        step_requests.append(num_running_requests)
-        num_running_requests -= context.complete_num_requests_per_step
-
-    if step_requests[-1] != 1:
-        # have 1 request running at the last step. This is often
-        # useful
-        step_requests.append(1)
-
-    return step_requests
-
-
-def run_profile(
-    context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
-):
-    print("Run profile with:")
-    for key, value in asdict(context).items():
-        print(f"  {key} = {value}")
-
-    requests_per_step: list[int] = determine_requests_per_step(context)
-
-    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
-        context.batch_size, requests_per_step
-    )
-
-    num_steps_to_profile: int = len(requests_per_step)
-    max_output_len: int = max(ol_nr.keys())
-    assert max_output_len >= 1
-
-    # Create sampling params
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        # max_tokens is set on a per-request basis.
-        max_tokens=None,
-        ignore_eos=True,
-    )
-
-    # Create LLM
-    llm = LLM(**asdict(context.engine_args))
-    batch_size = context.batch_size
-    prompt_len = context.prompt_len
-
-    scheduler_config = llm.llm_engine.vllm_config.scheduler_config
-    max_model_len = llm.llm_engine.model_config.max_model_len
-    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
-    max_num_seqs = scheduler_config.max_num_seqs
-
-    if batch_size * prompt_len > max_num_batched_tokens:
-        print(
-            f"ERROR: chosen batch_size * prompt_len "
-            f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is  "
-            f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
-            f"and therefore cannot be run in a single profile step, please "
-            f"choose a smaller batch size or prompt length, or increase "
-            f"--max-num-batched-tokens"
-        )
-        sys.exit(-1)
-    if batch_size > max_num_seqs:
-        print(
-            f"ERROR: chosen batch_size ({batch_size}) is larger than "
-            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
-            f"single profile step, please choose a smaller batch size"
-        )
-        sys.exit(-1)
-    print(
-        "llm.llm_engine.model_config.max_model_len: ",
-        llm.llm_engine.model_config.max_model_len,
-    )
-    if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
-        print(
-            f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
-            f"{max_output_len} = {prompt_len + max_output_len}) is larger "
-            f"than the model's max_model_len ({max_model_len}), please "
-            f"choose a smaller prompt_len or max_output_len, or increase "
-            f"--max-model-len"
-        )
-        sys.exit(-1)
-
-    def add_requests():
-        def get_output_len_generator() -> Generator[int, Any, Any]:
-            for output_len, num_reqs in ol_nr.items():
-                for _ in range(num_reqs):
-                    yield output_len
-
-        output_len_generator = get_output_len_generator()
-        for i in range(batch_size):
-            sampling_params.max_tokens = next(output_len_generator)
-            assert isinstance(sampling_params.max_tokens, int)
-
-            prompt_token_ids = torch.randint(
-                llm.get_tokenizer().vocab_size, size=(prompt_len,)
-            ).tolist()
-
-            llm.llm_engine.add_request(
-                request_id=f"seq{i}",
-                prompt={"prompt_token_ids": prompt_token_ids},
-                params=sampling_params,
-            )
-
-    def abort_requests():
-        for i in range(batch_size):
-            llm.llm_engine.abort_request(f"seq{i}")
-
-    # Warm up run
-    print("Warm up run ...")
-    add_requests()
-    llm.llm_engine.step()  # Prefill
-    llm.llm_engine.step()  # Decode
-    abort_requests()
-
-    print("Profile run ...")
-    add_requests()
-
-    with layerwise_profile() as prefill_prof:
-        llm.llm_engine.step()  # First step is prefill
-
-    decode_profs = []
-    for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
-        num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
-        with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
-            llm.llm_engine.step()
-        decode_profs.append(decode_prof)
-
-    decode_results_list = [prof.results for prof in decode_profs]
-    prefill_results = prefill_prof.results
-    has_decode = len(decode_results_list) > 0
-
-    LINE_WIDTH = 80
-    print("=" * LINE_WIDTH)
-    print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
-    print("=" * LINE_WIDTH)
-    print()
-    prefill_results.print_model_table()
-
-    if has_decode:
-        print()
-        print("=" * LINE_WIDTH)
-        print(
-            f"= First Decode Step Model Table "
-            f"(prompt_len={prompt_len}, batch_size={batch_size})"
-        )
-        print("=" * LINE_WIDTH)
-        print()
-        decode_results_list[0].print_model_table()
-
-    print()
-    print("=" * LINE_WIDTH)
-    print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
-    print("=" * LINE_WIDTH)
-    print()
-    prefill_results.print_summary_table()
-
-    if has_decode:
-        print()
-        print("=" * LINE_WIDTH)
-        print(
-            f"= First Decode Step Summary Table "
-            f"(prompt_len={prompt_len}, batch_size={batch_size})"
-        )
-        print("=" * LINE_WIDTH)
-        print()
-        decode_results_list[0].print_summary_table()
-
-    if csv_output:
-        csv_filename_base = (
-            csv_output[:-4] if csv_output.endswith(".csv") else csv_output
-        )
-        prefill_results.export_model_stats_table_csv(
-            csv_filename_base + "_prefill_model_table.csv"
-        )
-        prefill_results.export_summary_stats_table_csv(
-            csv_filename_base + "_prefill_summary_table.csv"
-        )
-
-        if has_decode:
-            decode_results_list[0].export_model_stats_table_csv(
-                csv_filename_base + "_decode_model_table.csv"
-            )
-            decode_results_list[0].export_summary_stats_table_csv(
-                csv_filename_base + "_decode_summary_table.csv"
-            )
-
-    if json_output:
-        cuda_devices = [
-            torch.cuda.get_device_properties(dev_idx)
-            for dev_idx in range(torch.cuda.device_count())
-        ]
-
-        json_dict = {
-            "context": {
-                "python_version": f"{sys.version}",
-                "torch_version": f"{torch.__version__}",
-                "torch_cuda_version": f"{torch.version.cuda}",
-                "cuda_devices": f"{cuda_devices}",
-                **asdict(context),
-            },
-            "prefill": prefill_results.convert_stats_to_dict(),
-        }
-
-        if has_decode:
-            for idx, dr in enumerate(decode_results_list):
-                json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
-
-        # Add .json to json_output filename if it doesn't exist already.
-        json_output_file = (
-            json_output if json_output.endswith(".json") else json_output + ".json"
-        )
-        with open(json_output_file, "w+") as f:
-            json.dump(json_dict, f, indent=2)
-        pass
-
-    if context.save_chrome_traces_folder is not None:
-        os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
-        prefill_prof.profiler.export_chrome_trace(
-            context.save_chrome_traces_folder + "/prefill.json"
-        )
-        for idx, decode_prof in enumerate(decode_profs):
-            decode_prof.profiler.export_chrome_trace(
-                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
-            )
-        print(
-            "Traces saved as prefill.json and decode_1.json, etc."
-            f" in folder {context.save_chrome_traces_folder}"
-        )
-
-
-def parse_args():
-    parser = FlexibleArgumentParser(
-        description="""
-Profile a model
-
-    example:
-    ```
-    python examples/offline_inference/profiling.py \\
-        --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
-        --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
-        --enforce-eager run_num_steps -n 2
-    ```
-
-    then you can use various tools to analyze the json output
-    terminal ascii tables:
-        ```
-        python tools/profiler/print_layerwise_table.py \\
-            --json-trace Llama31-8b-FP8.json --phase prefill --table summary
-        ```
-    or create matplotlib stacked bar charts:
-        ```
-        python tools/profiler/visualize_layerwise_profile.py \\
-            --json-trace Llama31-8b-FP8.json \\
-            --output-directory profile_breakdown --plot-metric pct_cuda_time
-        ```
-""",
-        formatter_class=RawTextHelpFormatter,
-    )
-    parser.add_argument(
-        "--csv",
-        type=str,
-        default=None,
-        help="Export the results as multiple csv file. This should be the root "
-        "filename, will create <filename>_prefill_model_table.csv, "
-        "<filename>_prefill_summary_table.csv, "
-        "<filename>_decode_model_table.csv, and "
-        "<filename>_decode_summary_table.csv",
-    )
-    parser.add_argument(
-        "--json",
-        type=str,
-        default=None,
-        help="Export the results as a json file. This should be the filename",
-    )
-    parser.add_argument(
-        "--save-chrome-traces-folder",
-        type=str,
-        help="Save chrome traces for the prefill and decode "
-        "will save traces as prefill.json and decode_1.json, "
-        "etc. inside this folder",
-    )
-    parser.add_argument(
-        "--prompt-len",
-        type=int,
-        default=PROMPT_LEN_DEFAULT,
-        help=f"Length of the random prompt to use when profiling, all batched "
-        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=BATCH_SIZE_DEFAULT,
-        help=f"Number of requests to run as a single batch, "
-        f"default={BATCH_SIZE_DEFAULT}",
-    )
-
-    subparsers = parser.add_subparsers(dest="cmd")
-
-    run_num_steps_parser = subparsers.add_parser(
-        "run_num_steps", help="This variation profiles n engine.step() invocations."
-    )
-    run_num_steps_parser.add_argument(
-        "-n",
-        "--num-steps",
-        type=int,
-        help="Number of engine steps to profile.\n"
-        "Setting it to 1, profiles only the prefill step.\n"
-        "Setting it to 2, profiles the prefill and first decode step\n"
-        "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
-        "and so on ...",
-    )
-
-    run_to_completion_parser = subparsers.add_parser(
-        "run_to_completion",
-        help="This variation profiles all the engine.step() invocations"
-        "until the engine exhausts all submitted requests.",
-    )
-    run_to_completion_parser.add_argument(
-        "-n",
-        "--complete-num-requests-per-step",
-        type=int,
-        help="Complete complete_num_requests_per_step requests every decode step."
-        "For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
-        "the profiler is run for 6 engine steps, with the steps processing, "
-        "128, 128, 96, 64, 32, 1 requests respectively.\n"
-        "Note that we tack-on a one-request step at the end as it is often "
-        "useful.",
-    )
-
-    EngineArgs.add_cli_args(parser)
-
-    return parser.parse_args()
-
-
-def main(args):
-    context = ProfileContext(
-        engine_args=EngineArgs.from_cli_args(args),
-        **{
-            k: v
-            for k, v in vars(args).items()
-            if k in inspect.signature(ProfileContext).parameters
-        },
-    )
-    run_profile(context, csv_output=args.csv, json_output=args.json)
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 24b1c9a93126c..411f3e01bc2cd 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -11,7 +11,7 @@ from unittest.mock import Mock
 import pytest
 import torch
 
-from vllm import LLM, envs
+from vllm import LLM
 from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
 
 from ..conftest import HfRunner, VllmRunner
@@ -26,14 +26,6 @@ MODELS = [
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
     llm = LLM("distilbert/distilgpt2")
@@ -76,12 +68,6 @@ def test_models(
     model_executor: str,
     enable_prompt_embeds: bool,
 ) -> None:
-    if not envs.VLLM_USE_V1:
-        if async_scheduling:
-            pytest.skip("async_scheduling only supported in v1.")
-        if model_executor != "uni":
-            pytest.skip("only test uniproc executor for v0.")
-
     if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
         pytest.skip(
             f"{backend} does not support gemma2 with full context length.")
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index f3ad680b72b55..508740ab29389 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -122,11 +122,12 @@ def test_cumem_with_cudagraph():
         # sleep mode with safetensors
         ("meta-llama/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", False),
+        ("facebook/opt-125m", True),
     ])
 def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        assert use_v1
+        m.setenv("VLLM_USE_V1", "1")
         free, total = torch.cuda.mem_get_info()
         used_bytes_baseline = total - free  # in case other process is running
         llm = LLM(model, enable_sleep_mode=True)
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 022f183b31932..b6bebbba915be 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
     # Use global backends
     global backend, backend_unfused
 
-    use_v1 = False  # can be made a param once V1 support added
-    monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
+    monkeypatch.setenv("VLLM_USE_V1", "1")
     monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
 
     # Prompt 4 seems too open-ended, differs between fused and unfused
diff --git a/tests/conftest.py b/tests/conftest.py
index ce9de3bf94b59..f14b1e8780ad9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -160,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch):
         monkeypatch.delenv("VLLM_USE_V1")
 
 
-@pytest.fixture(params=[True, False])
-def run_with_both_engines(request, monkeypatch):
-    # Automatically runs tests twice, once with V1 and once without
-    use_v1 = request.param
-    # Tests decorated with `@skip_v1` are only run without v1
-    skip_v0 = request.node.get_closest_marker("skip_v0")
-    skip_v1 = request.node.get_closest_marker("skip_v1")
-
-    if use_v1:
-        if skip_v1:
-            pytest.skip("Skipping test on vllm V1")
-        monkeypatch.setenv('VLLM_USE_V1', '1')
-    else:
-        if skip_v0:
-            pytest.skip("Skipping test on vllm V0")
-        monkeypatch.setenv('VLLM_USE_V1', '0')
-
-    yield
-
-
 @pytest.fixture(autouse=True)
 def init_test_http_connection():
     # pytest_asyncio may use a different event loop per test
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index 3bbbcc755d134..e0ecb02d4f563 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -25,12 +25,6 @@ TOKEN_IDS = [
 ]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py
index 1b7be15d5d691..b219b33d1760e 100644
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -6,14 +6,6 @@ import pytest
 from vllm import LLM
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def test_empty_prompt():
     llm = LLM(model="openai-community/gpt2", enforce_eager=True)
     with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 8917aa5a5efb9..f0b61902eb568 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
             "--port",
             port,
         ],
-                                env={"VLLM_USE_V1": "1" if use_v1 else "0"})
+                                env={"VLLM_USE_V1": "1"})
 
         def is_server_up(url):
             try:
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 190c92e1251c2..f8454ad0a4c4d 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -69,28 +69,20 @@ def generate_params():
 
 @pytest.mark.parametrize("device, name, use_mla, block_size",
                          generate_params())
-@pytest.mark.parametrize("use_v1", [True, False])
 def test_env(
     device: str,
     name: str,
     use_mla: bool,
     block_size: int,
-    use_v1: bool,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with valid device-backend pairs."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, name)
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
 
-        if name == "FLASHINFER" and not use_v1:
-            pytest.skip("FlashInfer backend is only available on V1 engine")
-
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, None, block_size,
@@ -137,7 +129,7 @@ def test_env(
                                                    block_size,
                                                    False,
                                                    use_mla=use_mla)
-                        expected = f"{name}_VLLM_V1" if use_v1 else name
+                        expected = f"{name}_VLLM_V1"
                         assert backend.get_name() == expected
                 else:
                     backend = get_attn_backend(16,
@@ -146,7 +138,7 @@ def test_env(
                                                block_size,
                                                False,
                                                use_mla=use_mla)
-                    expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
+                    expected = "TRITON_ATTN_VLLM_V1"
                     assert backend.get_name() == expected
 
         elif device == "cuda":
@@ -163,11 +155,7 @@ def test_env(
                     # - TRITON_MLA: fallback for other cases
 
                     if name == "CUTLASS_MLA":
-                        if not use_v1:
-                            # CUTLASS_MLA only supported on V1 engine
-                            pytest.skip(
-                                "CUTLASS_MLA only supported on V1 engine")
-                        elif block_size != 128:
+                        if block_size != 128:
                             # CUTLASS_MLA only supports block_size == 128
                             pytest.skip(
                                 "CUTLASS_MLA only supports block_size 128")
@@ -181,11 +169,7 @@ def test_env(
                             expected = "CUTLASS_MLA_VLLM_V1"
                             assert backend.get_name() == expected
                     elif name == "FLASHINFER_MLA":
-                        if not use_v1:
-                            # FlashInfer MLA only supported on V1 engine
-                            pytest.skip(
-                                "FlashInfer MLA only supported on V1 engine")
-                        elif block_size not in [32, 64]:
+                        if block_size not in [32, 64]:
                             # FlashInfer MLA only supports block_size 32 or 64
                             pytest.skip(
                                 "FlashInfer MLA only supports block_size 32 "
@@ -217,23 +201,17 @@ def test_env(
                                                            block_size,
                                                            False,
                                                            use_mla=use_mla)
-                                expected = f"{name}_VLLM_V1" if use_v1 else name
+                                expected = f"{name}_VLLM_V1"
                                 assert backend.get_name() == expected
                     elif name == "FLASH_ATTN_MLA":
-                        if not use_v1:
-                            # FlashAttention MLA only supported on V1 engine
-                            pytest.skip(
-                                "FlashAttention MLA only supported on V1 engine"
-                            )
-                        else:
-                            backend = get_attn_backend(16,
-                                                       torch.float16,
-                                                       None,
-                                                       block_size,
-                                                       False,
-                                                       use_mla=use_mla)
-                            expected = "FLASH_ATTN_MLA"
-                            assert backend.get_name() == expected
+                        backend = get_attn_backend(16,
+                                                   torch.float16,
+                                                   None,
+                                                   block_size,
+                                                   False,
+                                                   use_mla=use_mla)
+                        expected = "FLASH_ATTN_MLA"
+                        assert backend.get_name() == expected
                     else:
                         # TRITON_MLA or other fallback
                         backend = get_attn_backend(16,
@@ -242,8 +220,7 @@ def test_env(
                                                    block_size,
                                                    False,
                                                    use_mla=use_mla)
-                        expected = ("TRITON_MLA_VLLM_V1"
-                                    if use_v1 else "TRITON_MLA")
+                        expected = "TRITON_MLA_VLLM_V1"
                         assert backend.get_name() == expected
                 elif name == "FLASHINFER":
                     backend = get_attn_backend(16,
@@ -252,7 +229,7 @@ def test_env(
                                                block_size,
                                                False,
                                                use_mla=use_mla)
-                    expected = "FLASHINFER_VLLM_V1" if use_v1 else name
+                    expected = "FLASHINFER_VLLM_V1"
                     assert backend.get_name() == expected
                 else:
                     backend = get_attn_backend(32,
@@ -261,36 +238,30 @@ def test_env(
                                                block_size,
                                                False,
                                                use_mla=use_mla)
-                    expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+                    expected = "FLASH_ATTN_VLLM_V1"
                     assert backend.get_name() == expected
 
-                    if use_v1:
-                        backend = get_attn_backend(16,
-                                                   torch.float16,
-                                                   None,
-                                                   block_size,
-                                                   False,
-                                                   use_mla=use_mla)
-                        assert backend.get_name() == "FLEX_ATTENTION", (
-                            "Should fallback to FlexAttention if head size is "
-                            "not supported by FlashAttention")
+                    backend = get_attn_backend(16,
+                                               torch.float16,
+                                               None,
+                                               block_size,
+                                               False,
+                                               use_mla=use_mla)
+                    assert backend.get_name() == "FLEX_ATTENTION", (
+                        "Should fallback to FlexAttention if head size is "
+                        "not supported by FlashAttention")
 
 
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
-@pytest.mark.parametrize("use_v1", [True, False])
 def test_fp32_fallback(
     device: str,
-    use_v1: bool,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with fp32."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
 
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float32, None, 16, False)
@@ -300,8 +271,7 @@ def test_fp32_fallback(
             with patch("vllm.attention.selector.current_platform",
                        CudaPlatform()):
                 backend = get_attn_backend(16, torch.float32, None, 16, False)
-            assert (backend.get_name() == "FLEX_ATTENTION"
-                    if use_v1 else "XFORMERS")
+            assert backend.get_name() == "FLEX_ATTENTION"
 
 
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
@@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
         assert backend.get_name() != STR_FLASH_ATTN_VAL
 
 
-@pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
+def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
     """Test that invalid attention backend names raise ValueError."""
     with monkeypatch.context() as m, patch(
             "vllm.attention.selector.current_platform", CudaPlatform()):
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
 
         # Should raise ValueError for invalid backend
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index 50c60341f0d88..221d5237823ca 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 import pytest
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
 from vllm.lora.request import LoRARequest
+from vllm.v1.engine.llm_engine import LLMEngine
 
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index c14e71cbdb96d..39c4dd735b725 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -15,7 +15,8 @@ from ...utils import check_logprobs_close
 # have a clean way to fall back, so we fail with
 # a clear msg when it happens.
 # https://github.com/vllm-project/vllm/issues/14524
-REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
+# NOTE(woosuk): Skipping these tests until V1 supports them.
+# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
 
 # This list contains the model that are using AITER kernel.
 # Skip model that are not using AITER tests.
@@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    if model in REQUIRES_V0:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
-
     if use_rocm_aiter and (model in AITER_MODEL_LIST):
         monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
     elif use_rocm_aiter and model not in AITER_MODEL_LIST:
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 206ad1352e06e..0b1f90e27db82 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import SamplingParams
 
-from ...utils import check_logprobs_close, check_outputs_equal
+from ...utils import check_logprobs_close
 
 # Mark all tests as hybrid
 pytestmark = pytest.mark.hybrid_model
@@ -88,15 +88,6 @@ def test_models(
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        if model not in V0_UNSUPPORTED_MODELS:
-            with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-                vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-                    example_prompts, max_tokens, num_logprobs)
-        else:
-            vllm_v0_outputs = None
-
     if model in V1_SUPPORTED_MODELS:
         with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
             vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
@@ -104,14 +95,6 @@ def test_models(
     else:
         vllm_v1_outputs = None
 
-    if vllm_v0_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
-
     if model in V1_SUPPORTED_MODELS:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
@@ -157,45 +140,6 @@ def test_batching(
     )
 
 
-@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
-def test_chunked_prefill(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    max_tokens: int,
-    num_logprobs: int,
-    chunked_prefill_token_size: int,
-    monkeypatch,
-) -> None:
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with vllm_runner(model,
-                         enable_chunked_prefill=True,
-                         max_num_batched_tokens=max_num_batched_tokens,
-                         max_num_seqs=max_num_seqs) as vllm_model:
-            chunked = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
-        with vllm_runner(model,
-                         enable_chunked_prefill=False,
-                         max_num_seqs=max_num_seqs) as vllm_model:
-            non_chunked = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
-        check_logprobs_close(
-            outputs_0_lst=chunked,
-            outputs_1_lst=non_chunked,
-            name_0="chunked",
-            name_1="non_chunked",
-        )
-
-
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 @pytest.mark.parametrize("max_tokens", [10])
 def test_chunked_prefill_with_parallel_sampling(
@@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding(
             "Could be related to mamba cache not padded correctly")
 
 
-@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
-@pytest.mark.parametrize("max_tokens", [20])
-def test_models_preemption_recompute(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    max_tokens: int,
-    monkeypatch,
-) -> None:
-    """
-    Tests that outputs are identical with and w/o preemptions (recompute).
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-            scheduler = vllm_model.llm.llm_engine.scheduler[0]
-            scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
-            preempt_vllm_outputs = vllm_model.generate_greedy(
-                example_prompts, max_tokens)
-
-            scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=preempt_vllm_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="vllm_preepmtions",
-            name_1="vllm",
-        )
-
-
 @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
     vllm_runner,
@@ -386,27 +298,10 @@ def test_full_cuda_graph(
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        if model not in V0_UNSUPPORTED_MODELS:
-            with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-                vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-                    example_prompts, max_tokens, num_logprobs)
-        else:
-            vllm_v0_outputs = None
-
     with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
-    if vllm_v0_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
-
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
@@ -442,27 +337,12 @@ def test_fp32_cache_state(
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with vllm_runner(model,
-                         max_num_seqs=MAX_NUM_SEQS,
-                         **{cache_dtype_param: "float32"}) as vllm_model:
-            vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
     with vllm_runner(model,
                      max_num_seqs=MAX_NUM_SEQS,
                      **{cache_dtype_param: "float32"}) as vllm_model:
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_v0_outputs,
-        name_0="hf",
-        name_1="vllm-v0",
-    )
-
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py
index 08722ac98b7ed..4ac91b5aed506 100644
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 
 import pytest
 import torch
@@ -82,7 +81,7 @@ def test_prm_models(
     check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
                                max_transformers_version="4.53.2")
 
-    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+    if current_platform.is_cpu():
         pytest.skip("CPU only supports V1")
 
     if current_platform.is_rocm():
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index afc27b6e0566e..97dd4d6135ac4 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -36,9 +36,6 @@ from ..utils import check_logprobs_close
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive for
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models(
     vllm_runner,
     example_prompts,
@@ -49,7 +46,6 @@ def test_models(
     enforce_eager: bool,
     backend: str,
     tensor_parallel_size: int,
-    disable_async_output_proc: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
@@ -74,7 +70,6 @@ def test_models(
                 tensor_parallel_size=tensor_parallel_size,
                 enforce_eager=enforce_eager,
                 kv_cache_dtype="auto",
-                disable_async_output_proc=disable_async_output_proc,
         ) as vllm_model:
             baseline_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -85,7 +80,6 @@ def test_models(
                 tensor_parallel_size=tensor_parallel_size,
                 enforce_eager=enforce_eager,
                 kv_cache_dtype=kv_cache_dtype,
-                disable_async_output_proc=disable_async_output_proc,
         ) as vllm_model:
             test_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -110,9 +104,6 @@ def test_models(
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-# Due to low-precision numerical divergence, this test is too sensitive for
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_cpu_models(
     vllm_runner,
     example_prompts,
@@ -120,7 +111,6 @@ def test_cpu_models(
     base_model: str,
     test_model: str,
     max_tokens: int,
-    disable_async_output_proc: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
@@ -138,7 +128,6 @@ def test_cpu_models(
                 max_model_len=MAX_MODEL_LEN,
                 dtype="bfloat16",
                 kv_cache_dtype="auto",
-                disable_async_output_proc=disable_async_output_proc,
         ) as vllm_model:
             baseline_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -148,7 +137,6 @@ def test_cpu_models(
                 max_model_len=MAX_MODEL_LEN,
                 dtype="bfloat16",
                 kv_cache_dtype=kv_cache_dtype,
-                disable_async_output_proc=disable_async_output_proc,
         ) as vllm_model:
             test_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, NUM_LOG_PROBS)
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 9281579b71e74..b9601114a3183 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -7,7 +7,6 @@ from unittest.mock import patch
 import pytest
 
 from vllm import LLM
-from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
 from vllm.utils import GiB_bytes
 from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
 from vllm.v1.engine.core import EngineCore as V1EngineCore
@@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                                   False))
 
     # Avoid calling model.forward()
-    def _initialize_kv_caches_v0(self) -> None:
-        self.cache_config.num_gpu_blocks = 0
-        self.cache_config.num_cpu_blocks = 0
-
     def _initialize_kv_caches_v1(self, vllm_config):
         kv_cache_specs = self.model_executor.get_kv_cache_specs()
         scheduler_kv_cache_config = get_kv_cache_configs(
@@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
         # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
         return 1, 0, scheduler_kv_cache_config
 
-    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
-                       _initialize_kv_caches_v0),
-          patch.object(V1EngineCore, "_initialize_kv_caches",
+    with (patch.object(V1EngineCore, "_initialize_kv_caches",
                        _initialize_kv_caches_v1), monkeypatch.context() as m):
         if model_info.v0_only:
-            m.setenv("VLLM_USE_V1", "0")
+            # NOTE(woosuk): skip the test for V0-only models
+            return
+
         if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
             # Phi4FlashForCausalLM and MotifForCausalLM
             # only supports DIFFERENTIAL_FLASH_ATTN backend
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index 4aa7bb7297893..cb30d77c4f0ea 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -42,6 +42,7 @@ def test_oot_registration_text_generation(
             assert rest == ""
 
 
+@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
 @create_new_process_for_each_test()
 def test_oot_registration_embedding(
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 6e2089ea2e0e2..1d7e4475011d0 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -7,15 +7,6 @@ import torch
 from vllm.plugins import load_general_plugins
 
 
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-
-
 def test_platform_plugins():
     # simulate workload by running an example
     import runpy
diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py
index 8c21216108685..099869a82ad21 100644
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -3,47 +3,18 @@
 
 import pytest
 
-from vllm.core.scheduler import Scheduler
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
-from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
-from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.engine.llm_engine import LLMEngine
 
 
-class DummyV0Scheduler(Scheduler):
-
-    def schedule(self):
-        raise Exception("Exception raised by DummyV0Scheduler")
-
-
-class DummyV1Scheduler(V1Scheduler):
+class DummyV1Scheduler(Scheduler):
 
     def schedule(self):
         raise Exception("Exception raised by DummyV1Scheduler")
 
 
-def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with pytest.raises(Exception) as exception_info:
-
-            engine_args = EngineArgs(
-                model="facebook/opt-125m",
-                enforce_eager=True,  # reduce test time
-                scheduler_cls=DummyV0Scheduler,
-            )
-
-            engine = LLMEngine.from_engine_args(engine_args=engine_args)
-
-            sampling_params = SamplingParams(max_tokens=1)
-            engine.add_request("0", "foo", sampling_params)
-            engine.step()
-
-        assert str(
-            exception_info.value) == "Exception raised by DummyV0Scheduler"
-
-
 def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
@@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
                 scheduler_cls=DummyV1Scheduler,
             )
 
-            engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
+            engine = LLMEngine.from_engine_args(engine_args=engine_args)
 
             sampling_params = SamplingParams(max_tokens=1)
             engine.add_request("0", "foo", sampling_params)
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 0320a5ef31a65..2960ffcbd9eab 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM
 
 from vllm.assets.audio import AudioAsset
 
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
 #   2. Increase beam_width to 8.
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index ea4a17dd2306f..1d77d37a5d581 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -9,13 +9,6 @@ import pytest
 
 from vllm import SamplingParams
 
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
 MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index 86fc14dc85f80..220a4a53f4671 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -8,12 +8,6 @@ from vllm import SamplingParams
 MODELS = ["distilbert/distilgpt2"]
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_ranks(
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 15ea55afe963b..bd2b91073d568 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -352,58 +352,3 @@ def test_decode_prompt_logprobs(complete_sequence: str,
         logprobs[token_id + 1].decoded_token
         for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
     ])
-
-
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
-def test_decode_prompt_logprobs_chunked_prefill(
-    vllm_runner,
-    model,
-    chunked_prefill_token_size: int,
-    example_prompts,
-    monkeypatch,
-):
-    # VLLM V1 does not use incremental detokenization for
-    # prompt logprobs, so this test strategy is irrelevant.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    max_num_seqs = 256
-    enable_chunked_prefill = False
-    max_num_batched_tokens = None
-    if chunked_prefill_token_size != -1:
-        enable_chunked_prefill = True
-        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
-        max_num_batched_tokens = chunked_prefill_token_size
-
-    with vllm_runner(model,
-                     dtype="half",
-                     max_logprobs=5,
-                     gpu_memory_utilization=0.5,
-                     enable_chunked_prefill=enable_chunked_prefill,
-                     max_num_batched_tokens=max_num_batched_tokens,
-                     max_num_seqs=max_num_seqs) as vllm_model:
-
-        vllm_sampling_params = SamplingParams(max_tokens=10,
-                                              logprobs=5,
-                                              prompt_logprobs=5,
-                                              temperature=0.0)
-        vllm_results = vllm_model.llm.generate(
-            example_prompts, sampling_params=vllm_sampling_params)
-
-        for idx, result in enumerate(vllm_results):
-            assert result.prompt_logprobs is not None
-            assert result.prompt_logprobs[0] is None
-
-            # Compared detokenized prompts ids to original prompt.
-            generated_string = ""
-            for (prompt_token,
-                 prompt_logprobs) in zip(result.prompt_token_ids[1:],
-                                         result.prompt_logprobs[1:]):
-                # prompt_logprobs is a dict of the token_id: logprob
-                # We select the token_id corresponding to the actual prompt
-                # Decoded token in the detokenized string corresponding to this
-                # prompt token.
-                generated_string += prompt_logprobs[prompt_token].decoded_token
-
-            assert generated_string == example_prompts[idx], (
-                "Detokenized prompt logprobs do not match original prompt")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 8912ff8bad429..242fcf501bfc4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1508,14 +1508,6 @@ class EngineArgs:
                                recommend_to_remove=True)
             return False
 
-        if self.kv_cache_dtype != "auto":
-            supported = current_platform.is_kv_cache_dtype_supported(
-                self.kv_cache_dtype, model_config)
-            if not supported:
-                _raise_or_fallback(feature_name="--kv-cache-dtype",
-                                   recommend_to_remove=False)
-                return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 014bc56bc8ece..a0fe38eb320d6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,1835 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import time
-from collections import Counter as collectionsCounter
-from collections import deque
-from contextlib import contextmanager
-from dataclasses import dataclass
-from functools import partial
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    Iterable, List, Literal, Mapping, NamedTuple, Optional)
-from typing import Sequence as GenericSequence
-from typing import Set, Type, Union, cast
+from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
 
-import torch
-import torch.nn as nn
-from typing_extensions import TypeVar
-
-import vllm.envs as envs
-from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
-from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
-from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.metrics_types import StatLoggerBase, Stats
-from vllm.engine.output_processor.interfaces import (
-    SequenceGroupOutputProcessor)
-from vllm.engine.output_processor.stop_checker import StopChecker
-from vllm.entrypoints.openai.logits_processors import (
-    get_logits_processors as get_openai_logits_processors)
-from vllm.executor.executor_base import ExecutorBase
-from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
-from vllm.inputs.parse import split_enc_dec_inputs
-from vllm.inputs.preprocess import InputPreprocessor
-from vllm.logger import init_logger
-from vllm.logits_process import get_bad_words_logits_processors
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
-from vllm.multimodal.cache import processor_only_cache_from_config
-from vllm.multimodal.processing import EncDecMultiModalProcessor
-from vllm.outputs import (PoolingRequestOutput, RequestOutput,
-                          RequestOutputFactory)
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
-from vllm.sampling_params import RequestOutputKind, SamplingParams
-from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
-                           Sequence, SequenceGroup, SequenceGroupBase,
-                           SequenceGroupMetadata, SequenceGroupOutput,
-                           SequenceStatus)
-from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
-                          init_tracer)
-from vllm.transformers_utils.detokenizer import Detokenizer
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
-                                               init_tokenizer_from_configs)
-from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
-                                  usage_message)
-from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind
-from vllm.version import __version__ as VLLM_VERSION
-from vllm.worker.model_runner_base import InputProcessingError
-from vllm.worker.worker_base import WorkerBase
-
-logger = init_logger(__name__)
-_LOCAL_LOGGING_INTERVAL_SEC = 5
-
-_O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
-_R = TypeVar("_R", default=Any)
-
-
-@dataclass
-class SchedulerOutputState:
-    """Caches the scheduler outputs for a virtual engine. Used for Multi-Step"""
-    seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None
-    scheduler_outputs: Optional[SchedulerOutputs] = None
-    allow_async_output_proc: bool = False
-    last_output: Optional[SamplerOutput] = None
-
-
-class OutputData(NamedTuple):
-    outputs: List[SamplerOutput]
-    seq_group_metadata_list: List[SequenceGroupMetadata]
-    scheduler_outputs: SchedulerOutputs
-    is_async: bool
-    is_last_step: bool
-    # Indicates if this output is from the first step of the
-    # multi-step. When multi-step is disabled, this is always
-    # set to True.
-    # is_first_step_output is invalid when `outputs` has
-    # outputs from multiple steps.
-    is_first_step_output: Optional[bool]
-    skip: List[int]
-
-
-class SchedulerContext:
-
-    def __init__(self) -> None:
-        self.output_queue: Deque[OutputData] = deque()
-        self.request_outputs: List[RequestOutput] = []
-        self.seq_group_metadata_list: Optional[
-            List[SequenceGroupMetadata]] = None
-        self.scheduler_outputs: Optional[SchedulerOutputs] = None
-
-    def append_output(self, outputs: List[SamplerOutput],
-                      seq_group_metadata_list: List[SequenceGroupMetadata],
-                      scheduler_outputs: SchedulerOutputs, is_async: bool,
-                      is_last_step: bool,
-                      is_first_step_output: Optional[bool]):
-        self.output_queue.append(
-            OutputData(outputs=outputs,
-                       seq_group_metadata_list=seq_group_metadata_list,
-                       scheduler_outputs=scheduler_outputs,
-                       is_async=is_async,
-                       is_last_step=is_last_step,
-                       is_first_step_output=is_first_step_output,
-                       skip=[]))
-
-
-class LLMEngine:
-    """An LLM engine that receives requests and generates texts.
-
-    This is the main class for the vLLM engine. It receives requests
-    from clients and generates texts from the LLM. It includes a tokenizer, a
-    language model (possibly distributed across multiple GPUs), and GPU memory
-    space allocated for intermediate states (aka KV cache). This class utilizes
-    iteration-level scheduling and efficient memory management to maximize the
-    serving throughput.
-
-    The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
-    and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
-    class wraps this class for online serving.
-
-    The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].
-
-    Args:
-        vllm_config: The configuration for initializing and running vLLM.
-        executor_class: The model executor class for managing distributed
-            execution.
-        log_stats: Whether to log statistics.
-        usage_context: Specified entry point, used for usage info collection.
-    """
-
-    DO_VALIDATE_OUTPUT: ClassVar[bool] = False
-    """A flag to toggle whether to validate the type of request output."""
-
-    @classmethod
-    @contextmanager
-    def enable_output_validation(cls):
-        cls.DO_VALIDATE_OUTPUT = True
-
-        yield
-
-        cls.DO_VALIDATE_OUTPUT = False
-
-    @classmethod
-    def validate_output(
-        cls,
-        output: object,
-        output_type: Type[_O],
-    ) -> _O:
-        do_validate = cls.DO_VALIDATE_OUTPUT
-
-        if ((TYPE_CHECKING or do_validate)
-                and not isinstance(output, output_type)):
-            raise TypeError(f"Expected output of type {output_type}, "
-                            f"but found type {type(output)}")
-
-        return cast(_O, output)
-
-    @classmethod
-    def validate_outputs(
-        cls,
-        outputs: GenericSequence[object],
-        output_type: Type[_O],
-    ) -> List[_O]:
-        do_validate = cls.DO_VALIDATE_OUTPUT
-
-        outputs_: List[_O]
-        if TYPE_CHECKING or do_validate:
-            outputs_ = []
-            for output in outputs:
-                if not isinstance(output, output_type):
-                    raise TypeError(f"Expected output of type {output_type}, "
-                                    f"but found type {type(output)}")
-
-                outputs_.append(output)
-        else:
-            outputs_ = outputs
-
-        return outputs_
-
-    tokenizer: Optional[AnyTokenizer]
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        executor_class: Type[ExecutorBase],
-        log_stats: bool,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
-        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
-        use_cached_outputs: bool = False,
-    ) -> None:
-        if envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. "
-                "This should not happen. As a workaround, try using "
-                "LLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
-
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.parallel_config = vllm_config.parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config  # noqa
-        self.load_config = vllm_config.load_config
-        self.structured_outputs_config = vllm_config.structured_outputs_config
-        self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
-        )
-
-        logger.info(
-            "Initializing a V0 LLM engine (v%s) with config: %s, "
-            "use_cached_outputs=%s, ",
-            VLLM_VERSION,
-            vllm_config,
-            use_cached_outputs,
-        )
-
-        self.log_stats = log_stats
-        self.use_cached_outputs = use_cached_outputs
-
-        if self.model_config.skip_tokenizer_init:
-            self.tokenizer = None
-            self.detokenizer = None
-        else:
-            self.tokenizer = self._init_tokenizer()
-            self.detokenizer = Detokenizer(self.tokenizer)
-
-        self.seq_counter = Counter()
-        self.generation_config_fields = (
-            self.model_config.try_get_generation_config())
-
-        self.input_preprocessor = InputPreprocessor(
-            self.model_config,
-            self.tokenizer,
-            mm_registry,
-            mm_processor_cache=processor_only_cache_from_config(
-                self.model_config, mm_registry),
-        )
-
-        self.model_executor = executor_class(vllm_config=vllm_config)
-
-        self._initialize_kv_caches()
-
-        # If usage stat is enabled, collect relevant info.
-        if is_usage_stats_enabled():
-            from vllm.model_executor.model_loader import (
-                get_architecture_class_name)
-            usage_message.report_usage(
-                get_architecture_class_name(self.model_config),
-                usage_context,
-                extra_kvs={
-                    # Common configuration
-                    "dtype":
-                    str(self.model_config.dtype),
-                    "tensor_parallel_size":
-                    self.parallel_config.tensor_parallel_size,
-                    "block_size":
-                    self.cache_config.block_size,
-                    "gpu_memory_utilization":
-                    self.cache_config.gpu_memory_utilization,
-                    "kv_cache_memory_bytes":
-                    self.cache_config.kv_cache_memory_bytes,
-                    # Quantization
-                    "quantization":
-                    self.model_config.quantization,
-                    "kv_cache_dtype":
-                    str(self.cache_config.cache_dtype),
-
-                    # Feature flags
-                    "enable_lora":
-                    bool(self.lora_config),
-                    "enable_prefix_caching":
-                    self.cache_config.enable_prefix_caching,
-                    "enforce_eager":
-                    self.model_config.enforce_eager,
-                    "disable_custom_all_reduce":
-                    self.parallel_config.disable_custom_all_reduce,
-                })
-
-        self.cached_scheduler_outputs = [
-            SchedulerOutputState()
-            for _ in range(self.parallel_config.pipeline_parallel_size)
-        ]
-
-        self.scheduler_contexts = [
-            SchedulerContext()
-            for _ in range(self.parallel_config.pipeline_parallel_size)
-        ]
-
-        if self.model_config.use_async_output_proc:
-            process_model_outputs = weak_bind(self._process_model_outputs)
-
-            self.async_callbacks = [
-                partial(process_model_outputs,
-                        ctx=self.scheduler_contexts[v_id])
-                for v_id in range(self.parallel_config.pipeline_parallel_size)
-            ]
-        else:
-            self.async_callbacks = []
-
-        # Currently used by AsyncLLMEngine to ensure quick append
-        # of request outputs to asyncio queues
-        self.process_request_outputs_callback: Optional[Callable] = None
-
-        # Create the scheduler.
-        # NOTE: the cache_config here have been updated with the numbers of
-        # GPU and CPU blocks, which are profiled in the distributed executor.
-        if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str):
-            Scheduler = resolve_obj_by_qualname(
-                self.vllm_config.scheduler_config.scheduler_cls)
-        else:
-            Scheduler = self.vllm_config.scheduler_config.scheduler_cls
-        self.scheduler = [
-            Scheduler(
-                self.scheduler_config, self.cache_config, self.lora_config,
-                self.parallel_config.pipeline_parallel_size,
-                self.async_callbacks[v_id]
-                if self.model_config.use_async_output_proc else None)
-            for v_id in range(self.parallel_config.pipeline_parallel_size)
-        ]
-
-        # Metric Logging.
-        if self.log_stats:
-            if stat_loggers is not None:
-                self.stat_loggers = stat_loggers
-            else:
-                # Lazy import for prometheus multiprocessing.
-                # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
-                # before prometheus_client is imported.
-                # See https://prometheus.github.io/client_python/multiprocess/
-                from vllm.engine.metrics import (LoggingStatLogger,
-                                                 PrometheusStatLogger)
-
-                self.stat_loggers = {
-                    "logging":
-                    LoggingStatLogger(
-                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                        vllm_config=vllm_config),
-                    "prometheus":
-                    PrometheusStatLogger(
-                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
-                        labels=dict(
-                            model_name=self.model_config.served_model_name),
-                        vllm_config=vllm_config),
-                }
-                self.stat_loggers["prometheus"].info("cache_config",
-                                                     self.cache_config)
-
-        self.tracer = None
-        if self.observability_config.otlp_traces_endpoint:
-            self.tracer = init_tracer(
-                "vllm.llm_engine",
-                self.observability_config.otlp_traces_endpoint)
-
-        # Initialize reasoning parser if reasoning backend is set.
-        if self.structured_outputs_config.reasoning_parser and self.tokenizer:
-            reasoner_class = ReasoningParserManager.get_reasoning_parser(
-                self.structured_outputs_config.reasoning_parser)
-            self.reasoner: ReasoningParser = reasoner_class(
-                self.tokenizer.get_lora_tokenizer())
-
-        # Create sequence output processor, e.g. for beam search or
-        # speculative decoding.
-        self.output_processor = (
-            SequenceGroupOutputProcessor.create_output_processor(
-                self.scheduler_config,
-                self.detokenizer,
-                self.scheduler,
-                self.seq_counter,
-                stop_checker=StopChecker(
-                    self.scheduler_config.max_model_len,
-                    self.reasoner
-                    if self.structured_outputs_config.reasoning_parser
-                    and self.tokenizer else None,
-                ),
-            ))
-
-        self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}
-
-        # Flag to set when an input fails to process and the engine should run
-        # the next step without re-scheduling.
-        self._skip_scheduling_next_step = False
-
-        # Don't keep the dummy data in memory
-        self.reset_mm_cache()
-
-    def _initialize_kv_caches(self) -> None:
-        """Initialize the KV cache in the worker(s).
-
-        The workers will determine the number of blocks in both the GPU cache
-        and the swap CPU cache.
-        """
-        start = time.time()
-        num_gpu_blocks, num_cpu_blocks = (
-            self.model_executor.determine_num_available_blocks())
-
-        if self.cache_config.num_gpu_blocks_override is not None:
-            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
-            logger.info(
-                "Overriding num_gpu_blocks=%d with "
-                "num_gpu_blocks_override=%d", num_gpu_blocks,
-                num_gpu_blocks_override)
-            num_gpu_blocks = num_gpu_blocks_override
-
-        self.cache_config.num_gpu_blocks = num_gpu_blocks
-        self.cache_config.num_cpu_blocks = num_cpu_blocks
-
-        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
-        elapsed = time.time() - start
-        logger.info(("init engine (profile, create kv cache, "
-                     "warmup model) took %.2f seconds"), elapsed)
-
-    @classmethod
-    def _get_executor_cls(cls,
-                          engine_config: VllmConfig) -> Type[ExecutorBase]:
-        # distributed_executor_backend must be set in VllmConfig.__post_init__
-        distributed_executor_backend = (
-            engine_config.parallel_config.distributed_executor_backend)
-        # Initialize the cluster and specify the executor class.
-        if isinstance(distributed_executor_backend, type):
-            if not issubclass(distributed_executor_backend, ExecutorBase):
-                raise TypeError(
-                    "distributed_executor_backend must be a subclass of "
-                    f"ExecutorBase. Got {distributed_executor_backend}.")
-            executor_class = distributed_executor_backend
-        elif distributed_executor_backend == "ray":
-            from vllm.executor.ray_distributed_executor import (
-                RayDistributedExecutor)
-            executor_class = RayDistributedExecutor
-        elif distributed_executor_backend == "mp":
-            from vllm.executor.mp_distributed_executor import (
-                MultiprocessingDistributedExecutor)
-            assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
-                "multiprocessing distributed executor backend does not "
-                "support VLLM_USE_RAY_SPMD_WORKER=1")
-            executor_class = MultiprocessingDistributedExecutor
-        elif distributed_executor_backend == "uni":
-            # JAX-style, single-process, multi-device executor.
-            from vllm.executor.uniproc_executor import UniProcExecutor
-            executor_class = UniProcExecutor
-        elif distributed_executor_backend == "external_launcher":
-            # executor with external launcher
-            from vllm.executor.uniproc_executor import (  # noqa
-                ExecutorWithExternalLauncher)
-            executor_class = ExecutorWithExternalLauncher
-        else:
-            raise ValueError("unrecognized distributed_executor_backend: "
-                             f"{distributed_executor_backend}")
-        return executor_class
-
-    @classmethod
-    def from_vllm_config(
-        cls,
-        vllm_config: VllmConfig,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
-        disable_log_stats: bool = False,
-    ) -> "LLMEngine":
-        return cls(
-            vllm_config=vllm_config,
-            executor_class=cls._get_executor_cls(vllm_config),
-            log_stats=(not disable_log_stats),
-            usage_context=usage_context,
-            stat_loggers=stat_loggers,
-        )
-
-    @classmethod
-    def from_engine_args(
-        cls,
-        engine_args: EngineArgs,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
-    ) -> "LLMEngine":
-        """Creates an LLM engine from the engine arguments."""
-        # Create the engine configs.
-        vllm_config = engine_args.create_engine_config(usage_context)
-
-        engine_cls = cls
-        if envs.VLLM_USE_V1:
-            from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
-            engine_cls = V1LLMEngine
-
-        return engine_cls.from_vllm_config(
-            vllm_config=vllm_config,
-            usage_context=usage_context,
-            stat_loggers=stat_loggers,
-            disable_log_stats=engine_args.disable_log_stats,
-        )
-
-    def __reduce__(self):
-        # This is to ensure that the LLMEngine is not referenced in
-        # the closure used to initialize Ray worker actors
-        raise RuntimeError("LLMEngine should not be pickled!")
-
-    def __del__(self):
-        # Shutdown model executor when engine is garbage collected
-        # Use getattr since __init__ can fail before the field is set
-        if model_executor := getattr(self, "model_executor", None):
-            model_executor.shutdown()
-
-    def get_tokenizer(self) -> AnyTokenizer:
-        if self.tokenizer is None:
-            raise ValueError("Unable to get tokenizer because "
-                             "skip_tokenizer_init is True")
-
-        return self.tokenizer
-
-    def _init_tokenizer(self) -> AnyTokenizer:
-        return init_tokenizer_from_configs(model_config=self.model_config)
-
-    def _verify_args(self) -> None:
-        self.model_config.verify_with_parallel_config(self.parallel_config)
-        self.cache_config.verify_with_parallel_config(self.parallel_config)
-        if self.lora_config:
-            self.lora_config.verify_with_model_config(self.model_config)
-            self.lora_config.verify_with_scheduler_config(
-                self.scheduler_config)
-
-    def _add_processed_request(
-        self,
-        request_id: str,
-        processed_inputs: ProcessorInputs,
-        params: SamplingParams,
-        arrival_time: float,
-        lora_request: Optional[LoRARequest],
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> Optional[SequenceGroup]:
-        """Add a processed request to the engine's request pool.
-        return the created sequence group.
-        """
-        if isinstance(params, SamplingParams) and params.n > 1:
-            ParallelSampleSequenceGroup.add_request(
-                request_id,
-                self,
-                params,
-                processed_inputs=processed_inputs,
-                arrival_time=arrival_time,
-                lora_request=lora_request,
-                trace_headers=trace_headers,
-                priority=priority,
-            )
-            return None
-
-        self._validate_model_inputs(processed_inputs)
-        # Create the sequences.
-        block_size = self.cache_config.block_size
-        seq_id = next(self.seq_counter)
-        eos_token_id = self.input_preprocessor.get_eos_token_id()
-
-        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
-
-        seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
-                       lora_request)
-
-        encoder_seq = (None if encoder_inputs is None else Sequence(
-            seq_id, encoder_inputs, block_size, eos_token_id, lora_request))
-
-        # Create a SequenceGroup based on SamplingParams
-        if isinstance(params, SamplingParams):
-            seq_group = self._create_sequence_group_with_sampling(
-                request_id,
-                seq,
-                params,
-                arrival_time=arrival_time,
-                lora_request=lora_request,
-                trace_headers=trace_headers,
-                encoder_seq=encoder_seq,
-                priority=priority)
-        else:
-            raise ValueError("SamplingParams must be provided.")
-
-        # Add the sequence group to the scheduler with least unfinished seqs.
-        costs = [
-            scheduler.get_num_unfinished_seq_groups()
-            for scheduler in self.scheduler
-        ]
-        min_cost_scheduler = self.scheduler[costs.index(min(costs))]
-        min_cost_scheduler.add_seq_group(seq_group)
-
-        return seq_group
-
-    def stop_remote_worker_execution_loop(self) -> None:
-        self.model_executor.stop_remote_worker_execution_loop()
-
-    def add_request(
-        self,
-        request_id: str,
-        prompt: PromptType,
-        params: SamplingParams,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> None:
-        """Add a request to the engine's request pool.
-
-        The request is added to the request pool and will be processed by the
-        scheduler as `engine.step()` is called. The exact scheduling policy is
-        determined by the scheduler.
-
-        Args:
-            request_id: The unique ID of the request.
-            prompt: The prompt to the LLM. See
-                [PromptType][vllm.inputs.PromptType]
-                for more details about the format of each input.
-            params: Parameters for sampling.
-                [SamplingParams][vllm.SamplingParams] for text generation.
-            arrival_time: The arrival time of the request. If None, we use
-                the current monotonic time.
-            lora_request: The LoRA request to add.
-            trace_headers: OpenTelemetry trace headers.
-            priority: The priority of the request.
-                Only applicable with priority scheduling.
-
-        Details:
-            - Set arrival_time to the current time if it is None.
-            - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
-            - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
-              from the list of [Sequence][vllm.sequence.Sequence].
-            - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
-              scheduler.
-
-        Example:
-            >>> # initialize engine
-            >>> engine = LLMEngine.from_engine_args(engine_args)
-            >>> # set request arguments
-            >>> example_prompt = "Who is the president of the United States?"
-            >>> sampling_params = SamplingParams(temperature=0.0)
-            >>> request_id = 0
-            >>>
-            >>> # add the request to the engine
-            >>> engine.add_request(
-            >>>    str(request_id),
-            >>>    example_prompt,
-            >>>    SamplingParams(temperature=0.0))
-            >>> # continue the request processing
-            >>> ...
-        """
-        if not isinstance(request_id, str):
-            raise TypeError(
-                f"request_id must be a string, got {type(request_id)}")
-
-        if lora_request is not None and not self.lora_config:
-            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
-                             "not enabled!")
-
-        if priority != 0 and not self.scheduler_config.policy == "priority":
-            raise ValueError(f"Got priority {priority} but "
-                             "Priority scheduling is not enabled.")
-
-        if isinstance(params, SamplingParams) \
-            and params.logits_processors:
-            raise ValueError(
-                "Logits processors are not supported in multi-step decoding")
-
-        if arrival_time is None:
-            arrival_time = time.time()
-
-        if (isinstance(prompt, dict)
-                and prompt.get("prompt_embeds", None) is not None):
-            if not prompt.get("prompt_token_ids", None):
-                seq_len = prompt["prompt_embeds"].shape[0]
-                prompt["prompt_token_ids"] = [0] * seq_len
-            if params.prompt_logprobs is not None:
-                raise ValueError(
-                    "prompt_logprobs is not compatible with prompt embeds.")
-
-        processed_inputs = self.input_preprocessor.preprocess(
-            prompt,
-            tokenization_kwargs=tokenization_kwargs,
-        )
-
-        self._add_processed_request(
-            request_id=request_id,
-            processed_inputs=processed_inputs,
-            params=params,
-            arrival_time=arrival_time,
-            lora_request=lora_request,
-            trace_headers=trace_headers,
-            priority=priority,
-        )
-
-    def _create_sequence_group_with_sampling(
-        self,
-        request_id: str,
-        seq: Sequence,
-        sampling_params: SamplingParams,
-        arrival_time: float,
-        lora_request: Optional[LoRARequest],
-        trace_headers: Optional[Mapping[str, str]] = None,
-        encoder_seq: Optional[Sequence] = None,
-        priority: int = 0,
-    ) -> SequenceGroup:
-        """Creates a SequenceGroup with SamplingParams."""
-        max_logprobs = self.get_model_config().max_logprobs
-        if (sampling_params.logprobs
-                and sampling_params.logprobs > max_logprobs) or (
-                    sampling_params.prompt_logprobs
-                    and sampling_params.prompt_logprobs > max_logprobs):
-            raise ValueError(f"Cannot request more than "
-                             f"{max_logprobs} logprobs.")
-
-        sampling_params = self._build_logits_processors(
-            sampling_params, lora_request)
-
-        # Defensive copy of SamplingParams, which are used by the sampler,
-        # this doesn't deep-copy LogitsProcessor objects
-        sampling_params = sampling_params.clone()
-
-        sampling_params.update_from_generation_config(
-            self.generation_config_fields, seq.eos_token_id)
-
-        # Create the sequence group.
-        draft_size = 1
-        if self.vllm_config.speculative_config is not None:
-            draft_size = \
-                self.vllm_config.speculative_config.num_speculative_tokens + 1
-        seq_group = SequenceGroup(request_id=request_id,
-                                  seqs=[seq],
-                                  arrival_time=arrival_time,
-                                  sampling_params=sampling_params,
-                                  lora_request=lora_request,
-                                  trace_headers=trace_headers,
-                                  encoder_seq=encoder_seq,
-                                  priority=priority,
-                                  draft_size=draft_size)
-
-        return seq_group
-
-    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
-        """Aborts a request(s) with the given ID.
-
-        Args:
-            request_id: The ID(s) of the request to abort.
-
-        Details:
-            - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].
-
-        Example:
-            >>> # initialize engine and add a request with request_id
-            >>> request_id = str(0)
-            >>> # abort the request
-            >>> engine.abort_request(request_id)
-        """
-        for scheduler in self.scheduler:
-            scheduler.abort_seq_group(
-                request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
-
-    def get_vllm_config(self) -> VllmConfig:
-        """Gets the vllm configuration."""
-        return self.vllm_config
-
-    def get_model_config(self) -> ModelConfig:
-        """Gets the model configuration."""
-        return self.model_config
-
-    def get_parallel_config(self) -> ParallelConfig:
-        """Gets the parallel configuration."""
-        return self.parallel_config
-
-    def get_scheduler_config(self) -> SchedulerConfig:
-        """Gets the scheduler configuration."""
-        return self.scheduler_config
-
-    def get_lora_config(self) -> LoRAConfig:
-        """Gets the LoRA configuration."""
-        return self.lora_config
-
-    def get_num_unfinished_requests(self) -> int:
-        """Gets the number of unfinished requests."""
-        return sum(scheduler.get_num_unfinished_seq_groups()
-                   for scheduler in self.scheduler)
-
-    def has_unfinished_requests(self) -> bool:
-        """Returns True if there are unfinished requests."""
-        return any(scheduler.has_unfinished_seqs()
-                   for scheduler in self.scheduler)
-
-    def has_unfinished_requests_for_virtual_engine(
-            self, virtual_engine: int) -> bool:
-        """
-        Returns True if there are unfinished requests for the virtual engine.
-        """
-        return self.scheduler[virtual_engine].has_unfinished_seqs()
-
-    def reset_mm_cache(self) -> bool:
-        """Reset the multi-modal cache."""
-        self.input_preprocessor.clear_cache()
-        return True
-
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        """Reset prefix cache for all devices."""
-
-        success = True
-        for scheduler in self.scheduler:
-            success = success and scheduler.reset_prefix_cache(device)
-        return success
-
-    def _process_model_outputs(self,
-                               ctx: SchedulerContext,
-                               request_id: Optional[str] = None) -> None:
-        """Apply the model output to the sequences in the scheduled seq groups
-        and return responses.
-
-        ctx: The virtual engine context to work on
-        request_id: If provided, then only this request is going to be processed
-        """
-
-        now = time.time()
-
-        if len(ctx.output_queue) == 0:
-            return None
-
-        # Get pending async postprocessor
-        if request_id:
-            # When we process only one request, no pop is required
-            # (since later we will process all of the rest)
-            (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
-             is_last_step, is_first_step_output, skip) = ctx.output_queue[0]
-        else:
-            (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
-             is_last_step, is_first_step_output,
-             skip) = ctx.output_queue.popleft()
-
-        # Sanity check
-        assert len(seq_group_metadata_list) == len(
-            scheduler_outputs.scheduled_seq_groups)
-
-        has_multiple_outputs: bool = len(outputs) > 1
-        outputs_by_sequence_group: List[List[SequenceGroupOutput]]
-        assert not has_multiple_outputs
-        outputs_by_sequence_group = outputs
-
-        # Determine the requests we need to operate on
-        if request_id:
-            indices = []
-            for i, seq_group_meta in enumerate(seq_group_metadata_list):
-                if seq_group_meta.request_id == request_id:
-                    assert i not in skip  # Cannot be called twice
-                    indices.append(i)
-                    break
-
-            # If the request_id was not found, then it means that
-            # this is a new request that has no pending async
-            # postprocessor
-            if not indices:
-                return
-        else:
-            indices = range(len(seq_group_metadata_list))  # type: ignore
-
-        finished_before: List[int] = []
-        finished_now: List[int] = []
-        for i in indices:
-            if i in skip:
-                continue
-
-            seq_group_meta = seq_group_metadata_list[i]
-            scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]
-
-            seq_group: SequenceGroup = scheduled_seq_group.seq_group
-
-            if seq_group.is_finished():
-                finished_before.append(i)
-                continue
-
-            output: List[SequenceGroupOutput]
-            if has_multiple_outputs:
-                output = outputs_by_sequence_group[i]
-            else:
-                output = [outputs_by_sequence_group[0][i]]
-
-            if not is_async:
-                seq_group.update_num_computed_tokens(
-                    seq_group_meta.token_chunk_size or 0)
-
-            if outputs:
-                for o in outputs:
-                    if (isinstance(o, SamplerOutput)
-                            and seq_group.metrics is not None):
-                        if seq_group.metrics.model_forward_time is not None:
-                            seq_group.metrics.model_forward_time += (
-                                o.model_forward_time or 0)
-                        else:
-                            seq_group.metrics.model_forward_time = (
-                                o.model_forward_time)
-                        if seq_group.metrics.model_execute_time is not None:
-                            seq_group.metrics.model_execute_time += (
-                                o.model_execute_time or 0)
-                        else:
-                            seq_group.metrics.model_execute_time = (
-                                o.model_execute_time)
-
-            self.output_processor.process_prompt_logprob(seq_group, output)
-            if seq_group_meta.do_sample:
-                self.output_processor.process_outputs(seq_group, output,
-                                                      is_async)
-
-            if seq_group.is_finished():
-                finished_now.append(i)
-
-        # Generate outputs for the requests that finished this iteration
-        for i in finished_now:
-            scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]
-
-            seq_group = scheduled_seq_group.seq_group
-            seq_group.maybe_set_first_token_time(now)
-            if not seq_group.is_prefill():
-                seq_group.set_last_token_time(now)
-            request_output = RequestOutputFactory.create(
-                seq_group,
-                self.seq_id_to_seq_group,
-                use_cache=self.use_cached_outputs)
-            if request_output:
-                ctx.request_outputs.append(request_output)
-
-        # When we process a single request, we skip it for the next time,
-        # and invoke the request output callback (if there was final output)
-        if request_id:
-            assert len(indices) == 1
-            skip.append(indices[0])
-
-            if (finished_now
-                    and self.process_request_outputs_callback is not None):
-                self.process_request_outputs_callback(ctx.request_outputs)
-                ctx.request_outputs.clear()
-            return
-
-        # Free currently finished requests
-        if finished_now:
-            for scheduler in self.scheduler:
-                scheduler.free_finished_seq_groups()
-
-        # Create the outputs
-        for i in indices:
-            if i in skip or i in finished_before or i in finished_now:
-                continue  # Avoids double processing
-
-            scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]
-
-            seq_group = scheduled_seq_group.seq_group
-            seq_group.maybe_set_first_token_time(now)
-            if not seq_group.is_prefill():
-                seq_group.set_last_token_time(now)
-            request_output = RequestOutputFactory.create(
-                seq_group,
-                self.seq_id_to_seq_group,
-                use_cache=self.use_cached_outputs)
-            if request_output:
-                ctx.request_outputs.append(request_output)
-
-        # Create outputs only after processing the scheduler's results
-
-        for seq_group in scheduler_outputs.ignored_seq_groups:
-            params = seq_group.sampling_params
-            if params is not None and params.output_kind == (
-                    RequestOutputKind.DELTA) and not seq_group.is_finished():
-                continue
-
-            request_output = RequestOutputFactory.create(
-                seq_group,
-                self.seq_id_to_seq_group,
-                use_cache=self.use_cached_outputs,
-            )
-            if request_output:
-                ctx.request_outputs.append(request_output)
-
-        # Immediately process request outputs here (if callback is given)
-        if (ctx.request_outputs
-                and self.process_request_outputs_callback is not None):
-            self.process_request_outputs_callback(ctx.request_outputs)
-            ctx.request_outputs.clear()
-
-        # For async case, we need to record the stats here.
-        # For non-async case, the stats are done in the
-        # LLMEngine/AsyncLLMEngine directly
-        if is_async:
-            # Log stats.
-            self.do_log_stats(scheduler_outputs, outputs, finished_before,
-                              skip)
-
-            # Tracing
-            self.do_tracing(scheduler_outputs, finished_before)
-
-        return None
-
-    def _advance_to_next_step(
-            self, output: SamplerOutput,
-            seq_group_metadata_list: List[SequenceGroupMetadata],
-            scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None:
-        """Given model output from a single run, append the tokens to the
-        sequences. This is normally done inside output processor, but it is
-        required if the worker is to perform async forward pass to next step.
-        """
-        for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \
-            zip(seq_group_metadata_list, output, scheduled_seq_groups):
-            seq_group = scheduled_seq_group.seq_group
-
-            if seq_group.is_finished():
-                continue
-
-            token_chunk_size = (seq_group_metadata.token_chunk_size
-                                if seq_group_metadata.token_chunk_size
-                                is not None else 0)
-            seq_group.update_num_computed_tokens(token_chunk_size)
-
-            if seq_group_metadata.do_sample:
-                assert len(sequence_group_outputs.samples) == 1, (
-                    "Async output processor expects a single sample"
-                    " (i.e sampling_params.n == 1)")
-                sample = sequence_group_outputs.samples[0]
-
-                assert len(seq_group.seqs) == 1
-                seq = seq_group.seqs[0]
-
-                seq.append_token_id(sample.output_token, sample.logprobs,
-                                    sample.output_embed)
-
-    def step(self) -> List[RequestOutput]:
-        """Performs one decoding iteration and returns newly generated results.
-
-        <figure markdown="span">
-        ![Overview of the step function](https://i.imgur.com/sv2HssD.png)
-        <figcaption>Overview of the step function</figcaption>
-        </figure>
-
-        Details:
-        - Step 1: Schedules the sequences to be executed in the next
-            iteration and the token blocks to be swapped in/out/copy.
-
-            - Depending on the scheduling policy,
-                sequences may be `preempted/reordered`.
-            - A Sequence Group (SG) refer to a group of sequences
-                that are generated from the same prompt.
-
-        - Step 2: Calls the distributed executor to execute the model.
-        - Step 3: Processes the model output. This mainly includes:
-
-            - Decodes the relevant outputs.
-            - Updates the scheduled sequence groups with model outputs
-                based on its `sampling parameters` (`use_beam_search` or not).
-            - Frees the finished sequence groups.
-
-        - Finally, it creates and returns the newly generated results.
-
-        Example:
-        ```
-        # Please see the example/ folder for more detailed examples.
-
-        # initialize engine and request arguments
-        engine = LLMEngine.from_engine_args(engine_args)
-        example_inputs = [(0, "What is LLM?",
-        SamplingParams(temperature=0.0))]
-
-        # Start the engine with an event loop
-        while True:
-            if example_inputs:
-                req_id, prompt, sampling_params = example_inputs.pop(0)
-                engine.add_request(str(req_id),prompt,sampling_params)
-
-            # continue the request processing
-            request_outputs = engine.step()
-            for request_output in request_outputs:
-                if request_output.finished:
-                    # return or show the request output
-
-            if not (engine.has_unfinished_requests() or example_inputs):
-                break
-        ```
-        """
-        if self.parallel_config.pipeline_parallel_size > 1:
-            raise NotImplementedError(
-                "Pipeline parallelism is only supported through AsyncLLMEngine "
-                "as performance will be severely degraded otherwise.")
-
-        # For llm_engine, there is no pipeline parallel support, so the engine
-        # used is always 0.
-        virtual_engine = 0
-
-        # These are cached outputs from previous iterations. None if on first
-        # iteration
-        cached_outputs = self.cached_scheduler_outputs[virtual_engine]
-        seq_group_metadata_list = cached_outputs.seq_group_metadata_list
-        scheduler_outputs = cached_outputs.scheduler_outputs
-        allow_async_output_proc = cached_outputs.allow_async_output_proc
-
-        ctx = self.scheduler_contexts[virtual_engine]
-
-        # Clear outputs for each new scheduler iteration
-        ctx.request_outputs.clear()
-
-        # Skip the scheduler if there are any remaining steps in the seq groups.
-        # This ensures that the scheduler is only called again when the current
-        # batch has completed.
-        # The scheduler is also skipped if a single request caused the last
-        # engine step to fail, and the previous schedule needs to be rerun.
-        if not self._has_remaining_steps(
-                seq_group_metadata_list
-        ) and not self._skip_scheduling_next_step:
-            # Schedule iteration
-            (seq_group_metadata_list, scheduler_outputs,
-             allow_async_output_proc
-             ) = self.scheduler[virtual_engine].schedule()
-
-            ctx.seq_group_metadata_list = seq_group_metadata_list
-            ctx.scheduler_outputs = scheduler_outputs
-
-            finished_requests_ids = self.scheduler[
-                virtual_engine].get_and_reset_finished_requests_ids()
-            # When n>1, elements in self.seq_id_to_seq_group should be deleted
-            # here, otherwise memory leaks.
-            for finished_request_id in finished_requests_ids:
-                if finished_request_id in self.seq_id_to_seq_group:
-                    del self.seq_id_to_seq_group[finished_request_id]
-
-            # Maybe switch from async mode to sync mode
-            if not allow_async_output_proc and len(ctx.output_queue) > 0:
-                self._process_model_outputs(ctx=ctx)
-
-        else:
-            finished_requests_ids = list()
-
-        assert seq_group_metadata_list is not None
-        assert scheduler_outputs is not None
-
-        if not scheduler_outputs.is_empty():
-
-            # Check if we have a cached last_output from the previous iteration.
-            # For supporting PP this is probably the best way to pass the
-            # sampled_token_ids, as a separate broadcast over all the PP stages
-            # will cause one virtual engine's microbatch to block the pipeline.
-            last_sampled_token_ids = \
-                self._get_last_sampled_token_ids(virtual_engine)
-
-            execute_model_req = ExecuteModelRequest(
-                seq_group_metadata_list=seq_group_metadata_list,
-                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
-                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
-                blocks_to_copy=scheduler_outputs.blocks_to_copy,
-                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
-                running_queue_size=scheduler_outputs.running_queue_size,
-                finished_requests_ids=finished_requests_ids,
-                # We use ExecuteModelRequest to pass the last sampled_token_ids
-                # to each of the non-last PP stages for in-place prepare_input.
-                last_sampled_token_ids=last_sampled_token_ids)
-
-            if allow_async_output_proc:
-                execute_model_req.async_callback = self.async_callbacks[
-                    virtual_engine]
-
-            try:
-                outputs = self.model_executor.execute_model(
-                    execute_model_req=execute_model_req)
-                self._skip_scheduling_next_step = False
-            except InputProcessingError as e:
-                # The input for this request cannot be processed, so we must
-                # abort it. If there are remaining requests in the batch that
-                # have been scheduled, they will be retried on the next step.
-                invalid_request_id = e.request_id
-                self._abort_and_cache_schedule(
-                    request_id=invalid_request_id,
-                    virtual_engine=virtual_engine,
-                    seq_group_metadata_list=seq_group_metadata_list,
-                    scheduler_outputs=scheduler_outputs,
-                    allow_async_output_proc=allow_async_output_proc)
-                # Raise so the caller is notified that this request failed
-                raise
-
-        else:
-            # Nothing scheduled => If there is pending async postprocessor,
-            # then finish it here.
-            if len(ctx.output_queue) > 0:
-                self._process_model_outputs(ctx=ctx)
-            # No outputs in this case
-            outputs = []
-
-        if not self._has_remaining_steps(seq_group_metadata_list):
-            # is_first_step_output is True only when the num_steps of all
-            # the sequences are 1.
-            is_first_step_output: bool = False if not seq_group_metadata_list \
-                else seq_group_metadata_list[0].state.num_steps == 1
-
-            # Add results to the output_queue
-            ctx.append_output(outputs=outputs,
-                              seq_group_metadata_list=seq_group_metadata_list,
-                              scheduler_outputs=scheduler_outputs,
-                              is_async=allow_async_output_proc,
-                              is_last_step=True,
-                              is_first_step_output=is_first_step_output)
-
-            if outputs and allow_async_output_proc:
-                assert len(outputs) == 1, (
-                    "Async postprocessor expects only a single output set")
-
-                self._advance_to_next_step(
-                    outputs[0], seq_group_metadata_list,
-                    scheduler_outputs.scheduled_seq_groups)
-
-            # Check if need to run the usual non-async path
-            if not allow_async_output_proc:
-                self._process_model_outputs(ctx=ctx)
-
-                # Log stats.
-                self.do_log_stats(scheduler_outputs, outputs)
-
-                # Tracing
-                self.do_tracing(scheduler_outputs)
-        else:
-            # Multi-step case
-            return ctx.request_outputs
-
-        if not self.has_unfinished_requests():
-            # Drain async postprocessor (if exists)
-            if len(ctx.output_queue) > 0:
-                self._process_model_outputs(ctx=ctx)
-            assert len(ctx.output_queue) == 0
-
-            # Stop the execute model loop in parallel workers until there are
-            # more requests to process. This avoids waiting indefinitely in
-            # torch.distributed ops which may otherwise time out, and unblocks
-            # the RPC thread in the workers so that they can process any other
-            # queued control plane messages, such as add/remove lora adapters.
-            logger.debug("Stopping remote worker execution loop.")
-            self.model_executor.stop_remote_worker_execution_loop()
-
-        return ctx.request_outputs
-
-    def _abort_and_cache_schedule(
-            self, request_id: str, virtual_engine: int,
-            seq_group_metadata_list: List[SequenceGroupMetadata],
-            scheduler_outputs: SchedulerOutputs,
-            allow_async_output_proc: bool) -> None:
-        """Aborts a single request, and caches the scheduler outputs minus that
-        request. This allows the next step to continue processing the remaining
-        requests without having to re-run the scheduler."""
-
-        # Abort the request and remove its sequence group from the current
-        # schedule
-        self.abort_request(request_id)
-        for i, metadata in enumerate(seq_group_metadata_list):
-            if metadata.request_id == request_id:
-                del seq_group_metadata_list[i]
-                break
-        for i, group in enumerate(scheduler_outputs.scheduled_seq_groups):
-            if group.seq_group.request_id == request_id:
-                del scheduler_outputs.scheduled_seq_groups[i]
-                break
-
-        # If there are still other sequence groups left in the schedule, cache
-        # them and flag the engine to reuse the schedule.
-        if len(seq_group_metadata_list) > 0:
-            self._skip_scheduling_next_step = True
-            # Reuse multi-step caching logic
-            self._cache_scheduler_outputs_for_multi_step(
-                virtual_engine=virtual_engine,
-                scheduler_outputs=scheduler_outputs,
-                seq_group_metadata_list=seq_group_metadata_list,
-                allow_async_output_proc=allow_async_output_proc)
-
-    def _has_remaining_steps(
-        self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]]
-    ) -> bool:
-        return False
-
-    def _cache_scheduler_outputs_for_multi_step(
-            self, virtual_engine: int,
-            seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-            scheduler_outputs: SchedulerOutputs,
-            allow_async_output_proc: bool) -> None:
-        co = self.cached_scheduler_outputs[virtual_engine]
-
-        co.seq_group_metadata_list = seq_group_metadata_list
-        co.scheduler_outputs = scheduler_outputs
-        co.allow_async_output_proc = allow_async_output_proc
-        co.last_output = None
-
-    def _update_cached_scheduler_output(
-            self, virtual_engine: int,
-            output: List[Optional[SamplerOutput]]) -> None:
-        if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0
-                and output[0] is not None):
-            last_output = output[-1]
-            assert last_output is not None
-            assert last_output.sampled_token_ids_cpu is not None
-            assert last_output.sampled_token_ids is None
-            assert last_output.sampled_token_probs is None
-            self.cached_scheduler_outputs[
-                virtual_engine].last_output = last_output
-
-    def _get_last_sampled_token_ids(
-            self, virtual_engine: int) -> Optional[torch.Tensor]:
-        return None
-
-    def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
-        if not self.log_stats:
-            raise RuntimeError(
-                "Stat logging is disabled. Set `disable_log_stats=False` "
-                "argument to enable.")
-        if logger_name in self.stat_loggers:
-            raise KeyError(f"Logger with name {logger_name} already exists.")
-        self.stat_loggers[logger_name] = logger
-
-    def remove_logger(self, logger_name: str) -> None:
-        if not self.log_stats:
-            raise RuntimeError(
-                "Stat logging is disabled. Set `disable_log_stats=False` "
-                "argument to enable.")
-        if logger_name not in self.stat_loggers:
-            raise KeyError(f"Logger with name {logger_name} does not exist.")
-        del self.stat_loggers[logger_name]
-
-    def do_log_stats(self,
-                     scheduler_outputs: Optional[SchedulerOutputs] = None,
-                     model_output: Optional[List[SamplerOutput]] = None,
-                     finished_before: Optional[List[int]] = None,
-                     skip: Optional[List[int]] = None) -> None:
-        """Forced log when no requests active."""
-        if self.log_stats:
-            stats = self._get_stats(scheduler_outputs, model_output,
-                                    finished_before, skip)
-            for logger in self.stat_loggers.values():
-                logger.log(stats)
-
-    def _get_stats(self,
-                   scheduler_outputs: Optional[SchedulerOutputs],
-                   model_output: Optional[List[SamplerOutput]] = None,
-                   finished_before: Optional[List[int]] = None,
-                   skip: Optional[List[int]] = None) -> Stats:
-        """Get Stats to be Logged to Prometheus.
-
-        Args:
-            scheduler_outputs: Optional, used to populate metrics related to
-                the scheduled batch,
-            model_output: Optional, used to emit speculative decoding metrics
-                which are created by the workers.
-            finished_before: Optional, indices of sequences that were finished
-                before. These sequences will be ignored.
-            skip: Optional, indices of sequences that were preempted. These
-                sequences will be ignored.
-        """
-        now = time.time()
-
-        # System State
-        #   Scheduler State
-        num_running_sys = sum(
-            len(scheduler.running) for scheduler in self.scheduler)
-        num_swapped_sys = sum(
-            len(scheduler.swapped) for scheduler in self.scheduler)
-        num_waiting_sys = sum(
-            len(scheduler.waiting) for scheduler in self.scheduler)
-
-        # KV Cache Usage in %
-        num_total_gpu = self.cache_config.num_gpu_blocks
-        gpu_cache_usage_sys = 0.
-        if num_total_gpu:  # Guard against both None and 0
-            num_free_gpu = sum(
-                scheduler.block_manager.get_num_free_gpu_blocks()
-                for scheduler in self.scheduler)
-            gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)
-
-        num_total_cpu = self.cache_config.num_cpu_blocks
-        cpu_cache_usage_sys = 0.
-        if num_total_cpu:  # Guard against both None and 0
-            num_free_cpu = sum(
-                scheduler.block_manager.get_num_free_cpu_blocks()
-                for scheduler in self.scheduler)
-            cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)
-
-        # Prefix Cache Hit Rate. Note that we always use
-        # the cache hit rate of the first virtual engine.
-        cpu_prefix_cache_hit_rate = self.scheduler[
-            0].get_prefix_cache_hit_rate(Device.CPU)
-        gpu_prefix_cache_hit_rate = self.scheduler[
-            0].get_prefix_cache_hit_rate(Device.GPU)
-
-        # Exchange the uasge and cache hit stats between gpu and cpu when
-        # running on cpu because the cpu_worker.py intentionally reports the
-        # number of cpu blocks as gpu blocks in favor of cache management.
-        if self.device_config.device_type == "cpu":
-            num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu
-            gpu_cache_usage_sys, cpu_cache_usage_sys = (
-                cpu_cache_usage_sys,
-                gpu_cache_usage_sys,
-            )
-            gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = (
-                cpu_prefix_cache_hit_rate,
-                gpu_prefix_cache_hit_rate,
-            )
-
-        # Iteration stats
-        num_prompt_tokens_iter = 0
-        num_generation_tokens_iter = 0
-        num_tokens_iter = 0
-        time_to_first_tokens_iter: List[float] = []
-        inter_token_latencies_iter: List[float] = []
-        num_preemption_iter = (0 if scheduler_outputs is None else
-                               scheduler_outputs.preempted)
-
-        # Request stats
-        #   Latency
-        time_e2e_requests: List[float] = []
-        time_queue_requests: List[float] = []
-        time_inference_requests: List[float] = []
-        time_prefill_requests: List[float] = []
-        time_decode_requests: List[float] = []
-        #   Metadata
-        num_prompt_tokens_requests: List[int] = []
-        num_generation_tokens_requests: List[int] = []
-        n_requests: List[int] = []
-        max_num_generation_tokens_requests: List[int] = []
-        max_tokens_requests: List[int] = []
-        finished_reason_requests: List[str] = []
-
-        # LoRA requests
-        running_lora_adapters = dict(
-            collectionsCounter([
-                running_request.lora_request.lora_name
-                for scheduler in self.scheduler
-                for running_request in scheduler.running
-                if running_request.lora_request
-            ]))
-        waiting_lora_adapters = dict(
-            collectionsCounter([
-                waiting_request.lora_request.lora_name
-                for scheduler in self.scheduler
-                for waiting_request in scheduler.waiting
-                if waiting_request.lora_request
-            ]))
-        max_lora_stat = "0"
-        if self.lora_config:
-            max_lora_stat = str(self.lora_config.max_loras)
-
-        # NOTE: This loop assumes prefill seq_groups are before
-        # decode seq_groups in scheduled_seq_groups.
-        if scheduler_outputs is not None:
-            # For async postprocessor, already finished sequences need to be
-            # not counted (to avoid double counting)
-            actual_num_batched_tokens = scheduler_outputs.num_batched_tokens  # type: ignore
-
-            num_generation_tokens_from_prefill_groups = 0
-            # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
-            # the len of scheduler_outputs.scheduled_seq_groups is !=
-            # scheduler_outputs.num_prefill_groups, this means that
-            # chunked prefills have been detected.
-
-            for idx, scheduled_seq_group in enumerate(
-                    scheduler_outputs.scheduled_seq_groups):
-                # Skip double logging when using async output proc
-                if finished_before and idx in finished_before:
-                    actual_num_batched_tokens -= 1
-                    continue
-
-                # Currently, skip == preempted sequences, so we need to skip
-                # their log stats
-                if skip and idx in skip:
-                    continue
-
-                group_was_prefill = idx < scheduler_outputs.num_prefill_groups
-                seq_group = scheduled_seq_group.seq_group
-
-                # NOTE: a seq_group that completed all of its prefill tokens
-                # in the last iteration will have seq_group.is_prefill() = False
-                # with group_was_prefill = True
-                if group_was_prefill:
-                    # Number of prompt tokens.
-                    num_prompt_tokens_iter += (
-                        scheduled_seq_group.token_chunk_size)
-
-                    # If the seq_group just finished the prefill state
-                    # get TTFT.
-                    if not seq_group.is_prefill():
-                        latency = seq_group.get_last_token_latency()
-                        time_to_first_tokens_iter.append(latency)
-
-                        # One generation token per finished prefill.
-                        num_generation_tokens_from_prefill_groups += (
-                            seq_group.num_seqs())
-                else:
-                    # ITLs
-                    latency = seq_group.get_last_token_latency()
-                    inter_token_latencies_iter.append(latency)
-                    if seq_group.state.current_step == 0:
-                        # For async_output_proc, the do_log_stats()
-                        # is called following init_multi_step(), which
-                        # sets the current_step to zero.
-                        actual_num_batched_tokens +=\
-                            seq_group.state.num_steps - 1
-                    else:
-                        actual_num_batched_tokens +=\
-                            seq_group.state.current_step - 1
-
-                # Because of chunked prefill, we can have a single sequence
-                # group that does multiple prompt_runs. To prevent logging
-                # the same metadata more than once per request, we standardize
-                # on logging request level information for finished requests,
-                # which can only happen once.
-                if seq_group.is_finished():
-                    # Latency timings
-                    time_e2e_requests.append(now -
-                                             seq_group.metrics.arrival_time)
-                    if (seq_group.metrics.first_scheduled_time is not None and
-                            seq_group.metrics.first_token_time is not None):
-                        time_queue_requests.append(
-                            seq_group.metrics.first_scheduled_time -
-                            seq_group.metrics.arrival_time)
-                        time_prefill_requests.append(
-                            seq_group.metrics.first_token_time -
-                            seq_group.metrics.first_scheduled_time)
-                        time_decode_requests.append(
-                            now - seq_group.metrics.first_token_time)
-                        time_inference_requests.append(
-                            now - seq_group.metrics.first_scheduled_time)
-                    # Metadata
-                    num_prompt_tokens_requests.append(
-                        len(seq_group.prompt_token_ids))
-                    num_generation_tokens_requests.extend([
-                        seq.get_output_len()
-                        for seq in seq_group.get_finished_seqs()
-                    ])
-                    max_num_generation_tokens_requests.append(
-                        max(seq.get_output_len()
-                            for seq in seq_group.get_seqs()))
-                    if seq_group.sampling_params is not None:
-                        n_requests.append(seq_group.sampling_params.n)
-                        max_tokens_requests.append(
-                            seq_group.sampling_params.max_tokens)
-                    finished_reason_requests.extend([
-                        SequenceStatus.get_finished_reason(seq.status)
-                        for seq in seq_group.get_finished_seqs()
-                    ])
-
-            # Number of generation tokens.
-            #   num_batched_tokens equals the number of prompt_tokens plus the
-            #   number of decode_tokens in a single iteration. So,
-            #   num_generation_tokens = num_batched_tokens - num_prompt_tokens
-            #   + num_generation_tokens_from_prefill_groups (since we generate
-            #   one token on prefills on iters where the prefill finishes).
-            num_generation_tokens_iter = (
-                actual_num_batched_tokens - num_prompt_tokens_iter +
-                num_generation_tokens_from_prefill_groups)
-            num_tokens_iter = (num_generation_tokens_iter +
-                               num_prompt_tokens_iter)
-
-        return Stats(
-            now=now,
-            # System stats
-            #   Scheduler State
-            num_running_sys=num_running_sys,
-            num_swapped_sys=num_swapped_sys,
-            num_waiting_sys=num_waiting_sys,
-            #   KV Cache Usage in %
-            gpu_cache_usage_sys=gpu_cache_usage_sys,
-            cpu_cache_usage_sys=cpu_cache_usage_sys,
-            #   Prefix Cache Hit Rate
-            cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate,
-            gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate,
-
-            # Iteration stats
-            num_prompt_tokens_iter=num_prompt_tokens_iter,
-            num_generation_tokens_iter=num_generation_tokens_iter,
-            num_tokens_iter=num_tokens_iter,
-            time_to_first_tokens_iter=time_to_first_tokens_iter,
-            inter_token_latencies_iter=inter_token_latencies_iter,
-            num_preemption_iter=num_preemption_iter,
-
-            # Request stats
-            #   Latency
-            time_e2e_requests=time_e2e_requests,
-            time_queue_requests=time_queue_requests,
-            time_inference_requests=time_inference_requests,
-            time_prefill_requests=time_prefill_requests,
-            time_decode_requests=time_decode_requests,
-            #   Metadata
-            num_prompt_tokens_requests=num_prompt_tokens_requests,
-            num_generation_tokens_requests=num_generation_tokens_requests,
-            max_num_generation_tokens_requests=
-            max_num_generation_tokens_requests,
-            n_requests=n_requests,
-            max_tokens_requests=max_tokens_requests,
-            finished_reason_requests=finished_reason_requests,
-            max_lora=str(max_lora_stat),
-            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
-            running_lora_adapters=list(running_lora_adapters.keys()))
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        return self.model_executor.add_lora(lora_request)
-
-    def remove_lora(self, lora_id: int) -> bool:
-        return self.model_executor.remove_lora(lora_id)
-
-    def list_loras(self) -> Set[int]:
-        return self.model_executor.list_loras()
-
-    def pin_lora(self, lora_id: int) -> bool:
-        return self.model_executor.pin_lora(lora_id)
-
-    def start_profile(self) -> None:
-        self.model_executor.start_profile()
-
-    def stop_profile(self) -> None:
-        self.model_executor.stop_profile()
-
-    def sleep(self, level: int = 1) -> None:
-        assert self.vllm_config.model_config.enable_sleep_mode, (
-            "Sleep mode is not enabled in the model config")
-        self.model_executor.sleep(level=level)
-
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        assert self.vllm_config.model_config.enable_sleep_mode, (
-            "Sleep mode is not enabled in the model config")
-        self.model_executor.wake_up(tags)
-
-    def is_sleeping(self) -> bool:
-        return self.model_executor.is_sleeping
-
-    def check_health(self) -> None:
-        self.model_executor.check_health()
-
-    def is_tracing_enabled(self) -> bool:
-        return self.tracer is not None
-
-    def do_tracing(self,
-                   scheduler_outputs: SchedulerOutputs,
-                   finished_before: Optional[List[int]] = None) -> None:
-        if self.tracer is None:
-            return
-
-        for idx, scheduled_seq_group in enumerate(
-                scheduler_outputs.scheduled_seq_groups):
-            # Skip double tracing when using async output proc
-            if finished_before and idx in finished_before:
-                continue
-
-            seq_group = scheduled_seq_group.seq_group
-            if seq_group.is_finished():
-                self.create_trace_span(seq_group)
-
-    def create_trace_span(self, seq_group: SequenceGroup) -> None:
-        if self.tracer is None or seq_group.sampling_params is None:
-            return
-        arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9)
-
-        trace_context = extract_trace_context(seq_group.trace_headers)
-
-        with self.tracer.start_as_current_span(
-                "llm_request",
-                kind=SpanKind.SERVER,
-                context=trace_context,
-                start_time=arrival_time_nano_seconds) as seq_span:
-            metrics = seq_group.metrics
-
-            # Handle potential None values for cancelled/aborted requests
-            ttft = (metrics.first_token_time - metrics.arrival_time
-                    if metrics.first_token_time is not None else None)
-
-            e2e_time = (metrics.finished_time - metrics.arrival_time
-                        if metrics.finished_time is not None else None)
-
-            seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
-                                   self.model_config.model)
-            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
-                                   seq_group.request_id)
-            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
-                                   seq_group.sampling_params.temperature)
-            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
-                                   seq_group.sampling_params.top_p)
-            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
-                                   seq_group.sampling_params.max_tokens)
-            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
-                                   seq_group.sampling_params.n)
-            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
-                                   seq_group.num_seqs())
-            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
-                                   len(seq_group.prompt_token_ids))
-            seq_span.set_attribute(
-                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
-                sum([
-                    seq.get_output_len()
-                    for seq in seq_group.get_finished_seqs()
-                ]))
-
-            # Only set timing attributes if the values are available
-            if metrics.time_in_queue is not None:
-                seq_span.set_attribute(
-                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
-                    metrics.time_in_queue)
-            if ttft is not None:
-                seq_span.set_attribute(
-                    SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
-            if e2e_time is not None:
-                seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E,
-                                       e2e_time)
-            if metrics.scheduler_time is not None:
-                seq_span.set_attribute(
-                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
-                    metrics.scheduler_time)
-            if metrics.model_forward_time is not None:
-                seq_span.set_attribute(
-                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
-                    metrics.model_forward_time / 1000.0)
-            if metrics.model_execute_time is not None:
-                seq_span.set_attribute(
-                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
-                    metrics.model_execute_time)
-
-    def _validate_model_inputs(self, inputs: ProcessorInputs):
-        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
-
-        if encoder_inputs is not None:
-            self._validate_model_input(encoder_inputs, prompt_type="encoder")
-
-        self._validate_model_input(decoder_inputs, prompt_type="decoder")
-
-    def _validate_model_input(
-        self,
-        prompt_inputs: SingletonInputs,
-        *,
-        prompt_type: Literal["encoder", "decoder"],
-    ):
-        model_config = self.model_config
-        tokenizer = self.tokenizer
-
-        prompt_ids = prompt_inputs.get("prompt_token_ids", [])
-        if not prompt_ids:
-            if prompt_type == "encoder" and model_config.is_multimodal_model:
-                pass  # Mllama may have empty encoder inputs for text-only data
-            elif prompt_inputs["type"] == "embeds":
-                pass
-            else:
-                raise ValueError(f"The {prompt_type} prompt cannot be empty")
-
-        if tokenizer is not None:
-            max_input_id = max(prompt_ids, default=0)
-            if max_input_id > tokenizer.max_token_id:
-                raise ValueError(
-                    f"Token id {max_input_id} is out of vocabulary")
-
-        max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) > max_prompt_len:
-            if prompt_type == "encoder" and model_config.is_multimodal_model:
-                mm_registry = self.input_preprocessor.mm_registry
-                mm_processor = mm_registry.create_processor(
-                    model_config,
-                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
-                )
-                assert isinstance(mm_processor, EncDecMultiModalProcessor)
-
-                if mm_processor.pad_dummy_encoder_prompt:
-                    return  # Skip encoder length check for Whisper
-
-            if model_config.is_multimodal_model:
-                suggestion = (
-                    "Make sure that `max_model_len` is no smaller than the "
-                    "number of text tokens plus multimodal tokens. For image "
-                    "inputs, the number of image tokens depends on the number "
-                    "of images, and possibly their aspect ratios as well.")
-            else:
-                suggestion = (
-                    "Make sure that `max_model_len` is no smaller than the "
-                    "number of text tokens.")
-
-            raise ValueError(
-                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
-                f"longer than the maximum model length of {max_prompt_len}. "
-                f"{suggestion}")
-
-            # TODO: Find out how many placeholder tokens are there so we can
-            # check that chunked prefill does not truncate them
-            # max_batch_len = self.scheduler_config.max_num_batched_tokens
-
-    def _build_logits_processors(
-            self, sampling_params: SamplingParams,
-            lora_request: Optional[LoRARequest]) -> SamplingParams:
-        """Constructs logits processors based on the logits_bias, and
-        allowed_token_ids fields in sampling_params. Deletes those fields and
-        adds the constructed logits processors to the logits_processors field.
-        Returns the modified sampling params."""
-
-        logits_processors = []
-
-        if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
-            tokenizer = self.get_tokenizer()
-
-            processors = get_openai_logits_processors(
-                logit_bias=sampling_params.logit_bias,
-                allowed_token_ids=sampling_params.allowed_token_ids,
-                tokenizer=tokenizer)
-            logits_processors.extend(processors)
-
-            # Unset so these don't get passed down to the model
-            sampling_params.logit_bias = None
-            sampling_params.allowed_token_ids = None
-
-        if len(sampling_params.bad_words) > 0:
-            tokenizer = self.get_tokenizer()
-            processors = get_bad_words_logits_processors(
-                bad_words=sampling_params.bad_words, tokenizer=tokenizer)
-            logits_processors.extend(processors)
-
-        if logits_processors:
-            if sampling_params.logits_processors is None:
-                sampling_params.logits_processors = logits_processors
-            else:
-                sampling_params.logits_processors.extend(logits_processors)
-
-        return sampling_params
-
-    def collective_rpc(self,
-                       method: Union[str, Callable[[WorkerBase], _R]],
-                       timeout: Optional[float] = None,
-                       args: tuple = (),
-                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
-        return self.model_executor.collective_rpc(method, timeout, args,
-                                                  kwargs)
-
-    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        return self.collective_rpc("apply_model", args=(func, ))
-
-
-if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
-    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
-    LLMEngine = V1LLMEngine  # type: ignore
+LLMEngine = V1LLMEngine  # type: ignore
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f2282c40f7073..0ab806fcb8b5c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -11,7 +11,6 @@ from pydantic import ValidationError
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar
 
-import vllm.envs as envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                               BeamSearchSequence,
                               create_sort_beams_key_function)
@@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType,
                          StructuredOutputsConfig, TokenizerMode, is_init_field)
 from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
                                    PoolerConfig, RunnerOption)
-from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                          ChatTemplateContentFormatOption,
                                          apply_hf_chat_template,
@@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                                get_cached_tokenizer)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, Device, as_iter, is_list_of
+from vllm.v1.engine.llm_engine import LLMEngine
 from vllm.v1.sample.logits_processor import LogitsProcessor
 
 if TYPE_CHECKING:
@@ -309,11 +308,7 @@ class LLM:
         self.request_counter = Counter()
         self.default_sampling_params: Union[dict[str, Any], None] = None
 
-        if envs.VLLM_USE_V1:
-            supported_tasks = self.llm_engine \
-                .get_supported_tasks()  # type: ignore
-        else:
-            supported_tasks = self.llm_engine.model_config.supported_tasks
+        supported_tasks = self.llm_engine.get_supported_tasks()  # type: ignore
 
         logger.info("Supported_tasks: %s", supported_tasks)
 
@@ -1473,8 +1468,6 @@ class LLM:
         Note:
             This method is only available with the V1 LLM engine.
         """
-        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
-        assert isinstance(self.llm_engine, V1LLMEngine)
         return self.llm_engine.get_metrics()
 
     def _validate_and_add_requests(
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 58296131fadb9..13f4eebf1038e 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -672,21 +672,15 @@ def tensorize_vllm_model(engine_args: "EngineArgs",
         ) as stream:
             stream.write(encryption_params.key)
 
-    from vllm import LLMEngine
-    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+    assert envs.VLLM_USE_V1
 
-    if not envs.VLLM_USE_V1:
-        engine = LLMEngine.from_engine_args(engine_args)
-        engine.model_executor.collective_rpc(
-            "save_tensorized_model",
-            kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
-        )
-    else:
-        engine = V1LLMEngine.from_vllm_config(engine_config)
-        engine.collective_rpc(
-            "save_tensorized_model",
-            kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
-        )
+    from vllm.v1.engine.llm_engine import LLMEngine
+
+    engine = LLMEngine.from_vllm_config(engine_config)
+    engine.collective_rpc(
+        "save_tensorized_model",
+        kwargs={"tensorizer_config": tensorizer_config.to_serializable()},
+    )
 
 
 def tensorize_lora_adapter(lora_path: str,