Merge branch 'main' into woosuk/model-runner-v2

2026-07-08 06:57:09 +08:00 · 2025-09-21 11:25:18 -07:00 · 2025-09-21 11:25:18 -07:00 · 631b5b47c1
commit 631b5b47c1
parent 42ffdd9179 1c3ffdbecc
272 changed files with 1371 additions and 18894 deletions
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -6,24 +6,28 @@
 # to generate the final pipeline yaml file.

 # Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-#     in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on the first host, the second
 #     command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.

 # When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 #   Note that all steps execute in parallel.
@@ -110,7 +114,7 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Entrypoints Integration Test (API Server) # 100min
  timeout_in_minutes: 130
@@ -148,7 +152,6 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
-  - vllm/core/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
@@ -163,7 +166,6 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  commands:
  # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -314,12 +316,11 @@ steps:
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
-    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@@ -869,8 +870,6 @@ steps:
  - tests/distributed/
  - vllm/compilation
  - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - tests/v1/test_external_lb_dp.py
@@ -894,7 +893,7 @@ steps:
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s models/multimodal/generation/test_maverick.py

--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -4,11 +4,8 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/fused_moe @mgoin
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -1,510 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import inspect
-import json
-import os
-import sys
-from argparse import RawTextHelpFormatter
-from collections.abc import Generator
-from dataclasses import asdict, dataclass
-from typing import Any, Optional, TypeAlias
-
-import torch
-import tqdm
-
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.profiler.layerwise_profile import layerwise_profile
-from vllm.utils import FlexibleArgumentParser
-
-BATCH_SIZE_DEFAULT = 1
-PROMPT_LEN_DEFAULT = 256
-
-
-@dataclass
-class ProfileContext:
-    engine_args: EngineArgs
-    prompt_len: int
-    batch_size: int
-
-    # The profiler can run in 2 modes,
-    # 1. Run profiler for user specified num_steps
-    num_steps: Optional[int] = None
-    # 2. Run profiler until all requests complete
-    complete_num_requests_per_step: Optional[int] = None
-
-    save_chrome_traces_folder: Optional[str] = None
-
-
-def get_dtype(dtype: str):
-    if dtype == "torch.float":
-        return torch.float
-    else:
-        return dtype
-
-
-OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
-
-
-def compute_request_output_lengths(
-    batch_size: int, step_requests: list[int]
-) -> OutputLen_NumReqs_Map:
-    """
-    Given the number of requests, batch_size, and the number of requests
-    that each engine-step should process, step_requests, determine the
-    output lengths of the requests such that step_request is honoured.
-
-    Example:
-    if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
-    then return,
-    {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
-    32 requests should have output length 2,
-    32 requests should have output length 3,
-    32 requests should have output length 4,
-    31 requests should have output length 5,
-    1 request should have output length 6.
-
-    Args:
-        batch_size (int): Number of requests submitted for profile. This is
-            args.batch_size.
-        step_requests (list[int]): step_requests[i] is the number of requests
-            that the ith engine step should process.
-
-    Returns:
-        OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
-            number of requests required to have that output-length as values.
-    """
-    ol_nr: OutputLen_NumReqs_Map = {}
-
-    # Number of request that are assigned an output-length
-    num_reqs_assigned: int = 0
-    num_steps: int = len(step_requests)
-
-    # sanity check. The first step (prefill-step), must process all requests.
-    assert step_requests[0] == batch_size
-
-    # Begin assignments from the last step.
-    output_length: int = num_steps
-    for num_requests_at_step in reversed(step_requests):
-        if num_reqs_assigned == batch_size:
-            break
-
-        assert num_reqs_assigned < batch_size
-
-        # Remove the number of requests that have been determined
-        # to participate in this step and beyond.
-        num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
-        assert num_reqs_unassigned_at_step >= 0
-
-        if num_reqs_unassigned_at_step > 0:
-            ol_nr[output_length] = num_reqs_unassigned_at_step
-            num_reqs_assigned += num_reqs_unassigned_at_step
-
-        output_length -= 1
-
-    # sanity checks.
-    assert sum(ol_nr.values()) == batch_size, (
-        "Number of requests in output-length assignment does not match "
-        f"batch-size.\n batch size {batch_size} - "
-        f"step requests {step_requests} - assignments {ol_nr}"
-    )
-
-    # Check that the output-length is in [1, num-steps]. Output length must be
-    # at least 1 as all requests must participate in the prefill-step.
-    assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
-        "Output lengths of requests should be in range "
-        f"[1, num-engine-steps].\n batch size {batch_size} - "
-        f"step requests {step_requests} - assignments {ol_nr}"
-    )
-
-    return ol_nr
-
-
-def determine_requests_per_step(context: ProfileContext) -> list[int]:
-    """
-    Determine number of requests each engine step should process.
-    If context.num_steps is set, then all engine steps process the
-    same number of requests and the output list is of length
-    context.num_steps.
-
-    If context.complete_num_requests_per_step is set, then each decode step
-    processes fewer and fewer requests until there are no requests to process.
-    In this case, the output list is as big as the number of steps
-    required to process all requests.
-
-    Args:
-        context: ProfileContext object.
-
-    Returns:
-        list[int]: Number of requests to process for all engine-steps.
-         output[i], contains the number of requests that the ith step
-         should process.
-    """
-    if context.num_steps:
-        # All requests must run until num_engine_steps. This implies
-        # that their output lengths must be equal to num_engine_steps.
-        return [context.batch_size] * context.num_steps
-
-    assert (
-        context.complete_num_requests_per_step
-        and context.complete_num_requests_per_step > 0
-    ), (
-        f"Expected a positive complete_num_requests_per_step argument."
-        f"Instead got {context.complete_num_requests_per_step}"
-    )
-
-    # We start dropping after the first decode step.
-    step_requests = [
-        context.batch_size,  # prefill
-        context.batch_size,  # decode
-    ]
-
-    num_running_requests = context.batch_size
-    num_running_requests -= context.complete_num_requests_per_step
-    while num_running_requests > 0:
-        step_requests.append(num_running_requests)
-        num_running_requests -= context.complete_num_requests_per_step
-
-    if step_requests[-1] != 1:
-        # have 1 request running at the last step. This is often
-        # useful
-        step_requests.append(1)
-
-    return step_requests
-
-
-def run_profile(
-    context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
-):
-    print("Run profile with:")
-    for key, value in asdict(context).items():
-        print(f"  {key} = {value}")
-
-    requests_per_step: list[int] = determine_requests_per_step(context)
-
-    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
-        context.batch_size, requests_per_step
-    )
-
-    num_steps_to_profile: int = len(requests_per_step)
-    max_output_len: int = max(ol_nr.keys())
-    assert max_output_len >= 1
-
-    # Create sampling params
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        # max_tokens is set on a per-request basis.
-        max_tokens=None,
-        ignore_eos=True,
-    )
-
-    # Create LLM
-    llm = LLM(**asdict(context.engine_args))
-    batch_size = context.batch_size
-    prompt_len = context.prompt_len
-
-    scheduler_config = llm.llm_engine.vllm_config.scheduler_config
-    max_model_len = llm.llm_engine.model_config.max_model_len
-    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
-    max_num_seqs = scheduler_config.max_num_seqs
-
-    if batch_size * prompt_len > max_num_batched_tokens:
-        print(
-            f"ERROR: chosen batch_size * prompt_len "
-            f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is  "
-            f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
-            f"and therefore cannot be run in a single profile step, please "
-            f"choose a smaller batch size or prompt length, or increase "
-            f"--max-num-batched-tokens"
-        )
-        sys.exit(-1)
-    if batch_size > max_num_seqs:
-        print(
-            f"ERROR: chosen batch_size ({batch_size}) is larger than "
-            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
-            f"single profile step, please choose a smaller batch size"
-        )
-        sys.exit(-1)
-    print(
-        "llm.llm_engine.model_config.max_model_len: ",
-        llm.llm_engine.model_config.max_model_len,
-    )
-    if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
-        print(
-            f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
-            f"{max_output_len} = {prompt_len + max_output_len}) is larger "
-            f"than the model's max_model_len ({max_model_len}), please "
-            f"choose a smaller prompt_len or max_output_len, or increase "
-            f"--max-model-len"
-        )
-        sys.exit(-1)
-
-    def add_requests():
-        def get_output_len_generator() -> Generator[int, Any, Any]:
-            for output_len, num_reqs in ol_nr.items():
-                for _ in range(num_reqs):
-                    yield output_len
-
-        output_len_generator = get_output_len_generator()
-        for i in range(batch_size):
-            sampling_params.max_tokens = next(output_len_generator)
-            assert isinstance(sampling_params.max_tokens, int)
-
-            prompt_token_ids = torch.randint(
-                llm.get_tokenizer().vocab_size, size=(prompt_len,)
-            ).tolist()
-
-            llm.llm_engine.add_request(
-                request_id=f"seq{i}",
-                prompt={"prompt_token_ids": prompt_token_ids},
-                params=sampling_params,
-            )
-
-    def abort_requests():
-        for i in range(batch_size):
-            llm.llm_engine.abort_request(f"seq{i}")
-
-    # Warm up run
-    print("Warm up run ...")
-    add_requests()
-    llm.llm_engine.step()  # Prefill
-    llm.llm_engine.step()  # Decode
-    abort_requests()
-
-    print("Profile run ...")
-    add_requests()
-
-    with layerwise_profile() as prefill_prof:
-        llm.llm_engine.step()  # First step is prefill
-
-    decode_profs = []
-    for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
-        num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
-        with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
-            llm.llm_engine.step()
-        decode_profs.append(decode_prof)
-
-    decode_results_list = [prof.results for prof in decode_profs]
-    prefill_results = prefill_prof.results
-    has_decode = len(decode_results_list) > 0
-
-    LINE_WIDTH = 80
-    print("=" * LINE_WIDTH)
-    print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
-    print("=" * LINE_WIDTH)
-    print()
-    prefill_results.print_model_table()
-
-    if has_decode:
-        print()
-        print("=" * LINE_WIDTH)
-        print(
-            f"= First Decode Step Model Table "
-            f"(prompt_len={prompt_len}, batch_size={batch_size})"
-        )
-        print("=" * LINE_WIDTH)
-        print()
-        decode_results_list[0].print_model_table()
-
-    print()
-    print("=" * LINE_WIDTH)
-    print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
-    print("=" * LINE_WIDTH)
-    print()
-    prefill_results.print_summary_table()
-
-    if has_decode:
-        print()
-        print("=" * LINE_WIDTH)
-        print(
-            f"= First Decode Step Summary Table "
-            f"(prompt_len={prompt_len}, batch_size={batch_size})"
-        )
-        print("=" * LINE_WIDTH)
-        print()
-        decode_results_list[0].print_summary_table()
-
-    if csv_output:
-        csv_filename_base = (
-            csv_output[:-4] if csv_output.endswith(".csv") else csv_output
-        )
-        prefill_results.export_model_stats_table_csv(
-            csv_filename_base + "_prefill_model_table.csv"
-        )
-        prefill_results.export_summary_stats_table_csv(
-            csv_filename_base + "_prefill_summary_table.csv"
-        )
-
-        if has_decode:
-            decode_results_list[0].export_model_stats_table_csv(
-                csv_filename_base + "_decode_model_table.csv"
-            )
-            decode_results_list[0].export_summary_stats_table_csv(
-                csv_filename_base + "_decode_summary_table.csv"
-            )
-
-    if json_output:
-        cuda_devices = [
-            torch.cuda.get_device_properties(dev_idx)
-            for dev_idx in range(torch.cuda.device_count())
-        ]
-
-        json_dict = {
-            "context": {
-                "python_version": f"{sys.version}",
-                "torch_version": f"{torch.__version__}",
-                "torch_cuda_version": f"{torch.version.cuda}",
-                "cuda_devices": f"{cuda_devices}",
-                **asdict(context),
-            },
-            "prefill": prefill_results.convert_stats_to_dict(),
-        }
-
-        if has_decode:
-            for idx, dr in enumerate(decode_results_list):
-                json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
-
-        # Add .json to json_output filename if it doesn't exist already.
-        json_output_file = (
-            json_output if json_output.endswith(".json") else json_output + ".json"
-        )
-        with open(json_output_file, "w+") as f:
-            json.dump(json_dict, f, indent=2)
-        pass
-
-    if context.save_chrome_traces_folder is not None:
-        os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
-        prefill_prof.profiler.export_chrome_trace(
-            context.save_chrome_traces_folder + "/prefill.json"
-        )
-        for idx, decode_prof in enumerate(decode_profs):
-            decode_prof.profiler.export_chrome_trace(
-                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
-            )
-        print(
-            "Traces saved as prefill.json and decode_1.json, etc."
-            f" in folder {context.save_chrome_traces_folder}"
-        )
-
-
-def parse_args():
-    parser = FlexibleArgumentParser(
-        description="""
-Profile a model
-
-    example:
-    ```
-    python examples/offline_inference/profiling.py \\
-        --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
-        --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
-        --enforce-eager run_num_steps -n 2
-    ```
-
-    then you can use various tools to analyze the json output
-    terminal ascii tables:
-        ```
-        python tools/profiler/print_layerwise_table.py \\
-            --json-trace Llama31-8b-FP8.json --phase prefill --table summary
-        ```
-    or create matplotlib stacked bar charts:
-        ```
-        python tools/profiler/visualize_layerwise_profile.py \\
-            --json-trace Llama31-8b-FP8.json \\
-            --output-directory profile_breakdown --plot-metric pct_cuda_time
-        ```
-""",
-        formatter_class=RawTextHelpFormatter,
-    )
-    parser.add_argument(
-        "--csv",
-        type=str,
-        default=None,
-        help="Export the results as multiple csv file. This should be the root "
-        "filename, will create <filename>_prefill_model_table.csv, "
-        "<filename>_prefill_summary_table.csv, "
-        "<filename>_decode_model_table.csv, and "
-        "<filename>_decode_summary_table.csv",
-    )
-    parser.add_argument(
-        "--json",
-        type=str,
-        default=None,
-        help="Export the results as a json file. This should be the filename",
-    )
-    parser.add_argument(
-        "--save-chrome-traces-folder",
-        type=str,
-        help="Save chrome traces for the prefill and decode "
-        "will save traces as prefill.json and decode_1.json, "
-        "etc. inside this folder",
-    )
-    parser.add_argument(
-        "--prompt-len",
-        type=int,
-        default=PROMPT_LEN_DEFAULT,
-        help=f"Length of the random prompt to use when profiling, all batched "
-        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=BATCH_SIZE_DEFAULT,
-        help=f"Number of requests to run as a single batch, "
-        f"default={BATCH_SIZE_DEFAULT}",
-    )
-
-    subparsers = parser.add_subparsers(dest="cmd")
-
-    run_num_steps_parser = subparsers.add_parser(
-        "run_num_steps", help="This variation profiles n engine.step() invocations."
-    )
-    run_num_steps_parser.add_argument(
-        "-n",
-        "--num-steps",
-        type=int,
-        help="Number of engine steps to profile.\n"
-        "Setting it to 1, profiles only the prefill step.\n"
-        "Setting it to 2, profiles the prefill and first decode step\n"
-        "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
-        "and so on ...",
-    )
-
-    run_to_completion_parser = subparsers.add_parser(
-        "run_to_completion",
-        help="This variation profiles all the engine.step() invocations"
-        "until the engine exhausts all submitted requests.",
-    )
-    run_to_completion_parser.add_argument(
-        "-n",
-        "--complete-num-requests-per-step",
-        type=int,
-        help="Complete complete_num_requests_per_step requests every decode step."
-        "For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
-        "the profiler is run for 6 engine steps, with the steps processing, "
-        "128, 128, 96, 64, 32, 1 requests respectively.\n"
-        "Note that we tack-on a one-request step at the end as it is often "
-        "useful.",
-    )
-
-    EngineArgs.add_cli_args(parser)
-
-    return parser.parse_args()
-
-
-def main(args):
-    context = ProfileContext(
-        engine_args=EngineArgs.from_cli_args(args),
-        **{
-            k: v
-            for k, v in vars(args).items()
-            if k in inspect.signature(ProfileContext).parameters
-        },
-    )
-    run_profile(context, csv_output=args.csv, json_output=args.json)
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -102,6 +102,7 @@ plugins:
          - https://numpy.org/doc/stable/objects.inv
          - https://pytorch.org/docs/stable/objects.inv
          - https://psutil.readthedocs.io/en/stable/objects.inv
+          - https://huggingface.co/docs/transformers/main/en/objects.inv

 markdown_extensions:
  - attr_list
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,6 @@ line-length = 80
 "vllm/_version.py" = ["ALL"]
 # Python 3.8 typing - skip V0 code
 "vllm/attention/**/*.py" = ["UP006", "UP035"]
-"vllm/core/**/*.py" = ["UP006", "UP035"]
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
@@ -117,7 +116,6 @@ files = [
    "vllm/*.py",
    "vllm/assets",
    "vllm/entrypoints",
-    "vllm/core",
    "vllm/inputs",
    "vllm/logging_utils",
    "vllm/multimodal",
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -11,7 +11,7 @@ from unittest.mock import Mock
 import pytest
 import torch

-from vllm import LLM, envs
+from vllm import LLM
 from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1

 from ..conftest import HfRunner, VllmRunner
@@ -26,14 +26,6 @@ MODELS = [
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
    llm = LLM("distilbert/distilgpt2")
@@ -76,12 +68,6 @@ def test_models(
    model_executor: str,
    enable_prompt_embeds: bool,
 ) -> None:
-    if not envs.VLLM_USE_V1:
-        if async_scheduling:
-            pytest.skip("async_scheduling only supported in v1.")
-        if model_executor != "uni":
-            pytest.skip("only test uniproc executor for v0.")
-
    if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
        pytest.skip(
            f"{backend} does not support gemma2 with full context length.")
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -122,11 +122,12 @@ def test_cumem_with_cudagraph():
        # sleep mode with safetensors
        ("meta-llama/Llama-3.2-1B", True),
        # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", False),
+        ("facebook/opt-125m", True),
    ])
 def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        assert use_v1
+        m.setenv("VLLM_USE_V1", "1")
        free, total = torch.cuda.mem_get_info()
        used_bytes_baseline = total - free  # in case other process is running
        llm = LLM(model, enable_sleep_mode=True)
--- a/tests/build_cython.py
+++ b/tests/build_cython.py
@@ -1,39 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import Cython.Compiler.Options
-from Cython.Build import cythonize
-from setuptools import setup
-
-Cython.Compiler.Options.annotate = True
-
-infiles = []
-
-infiles += [
-    "vllm/engine/llm_engine.py",
-    "vllm/transformers_utils/detokenizer.py",
-    "vllm/engine/output_processor/single_step.py",
-    "vllm/outputs.py",
-    "vllm/engine/output_processor/stop_checker.py",
-]
-
-infiles += [
-    "vllm/core/scheduler.py",
-    "vllm/sequence.py",
-    "vllm/core/block_manager.py",
-]
-
-infiles += [
-    "vllm/model_executor/layers/sampler.py",
-    "vllm/sampling_params.py",
-    "vllm/utils/__init__.py",
-]
-
-setup(ext_modules=cythonize(infiles,
-                            annotate=False,
-                            force=True,
-                            compiler_directives={
-                                'language_level': "3",
-                                'infer_types': True
-                            }))
-
-# example usage: python3 build_cython.py build_ext --inplace
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -54,8 +54,7 @@ def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
    # Use global backends
    global backend, backend_unfused

-    use_v1 = False  # can be made a param once V1 support added
-    monkeypatch.setenv("VLLM_USE_V1", str(int(use_v1)))
+    monkeypatch.setenv("VLLM_USE_V1", "1")
    monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))

    # Prompt 4 seems too open-ended, differs between fused and unfused
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -19,6 +19,7 @@ import socket
 import tempfile
 import threading
 from collections.abc import Generator
+from contextlib import nullcontext
 from enum import Enum
 from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast

@ -45,14 +46,14 @@ from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
-from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
-                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
+from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
+from vllm.logprobs import Logprob
 from vllm.multimodal.utils import fetch_image
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.sequence import Logprob
 from vllm.transformers_utils.utils import maybe_model_redirect
+from vllm.utils import set_default_torch_num_threads

 logger = init_logger(__name__)

@ -159,26 +160,6 @@ def cleanup_VLLM_USE_V1(monkeypatch):
        monkeypatch.delenv("VLLM_USE_V1")


-@pytest.fixture(params=[True, False])
-def run_with_both_engines(request, monkeypatch):
-    # Automatically runs tests twice, once with V1 and once without
-    use_v1 = request.param
-    # Tests decorated with `@skip_v1` are only run without v1
-    skip_v0 = request.node.get_closest_marker("skip_v0")
-    skip_v1 = request.node.get_closest_marker("skip_v1")
-
-    if use_v1:
-        if skip_v1:
-            pytest.skip("Skipping test on vllm V1")
-        monkeypatch.setenv('VLLM_USE_V1', '1')
-    else:
-        if skip_v0:
-            pytest.skip("Skipping test on vllm V0")
-        monkeypatch.setenv('VLLM_USE_V1', '0')
-
-    yield
-
-
@pytest.fixture(autouse=True)
 def init_test_http_connection():
    # pytest_asyncio may use a different event loop per test
@ -306,6 +287,35 @@ class HfRunner:
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
+        # Set this to avoid hanging issue
+        default_torch_num_threads: Optional[int] = None,
+    ) -> None:
+        init_ctx = (nullcontext() if default_torch_num_threads is None else
+                    set_default_torch_num_threads(default_torch_num_threads))
+
+        with init_ctx:
+            self._init(
+                model_name=model_name,
+                dtype=dtype,
+                model_kwargs=model_kwargs,
+                trust_remote_code=trust_remote_code,
+                is_sentence_transformer=is_sentence_transformer,
+                is_cross_encoder=is_cross_encoder,
+                skip_tokenizer_init=skip_tokenizer_init,
+                auto_cls=auto_cls,
+            )
+
+    def _init(
+        self,
+        model_name: str,
+        dtype: str = "auto",
+        *,
+        model_kwargs: Optional[dict[str, Any]] = None,
+        trust_remote_code: bool = True,
+        is_sentence_transformer: bool = False,
+        is_cross_encoder: bool = False,
+        skip_tokenizer_init: bool = False,
+        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        model_name = maybe_model_redirect(model_name)
        self.model_name = model_name
@ -714,26 +724,32 @@ class VllmRunner:
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
+        # Set this to avoid hanging issue
+        default_torch_num_threads: Optional[int] = None,
        **kwargs,
    ) -> None:
-        self.llm = LLM(
-            model=model_name,
-            runner=runner,
-            convert=convert,
-            tokenizer=tokenizer_name,
-            tokenizer_mode=tokenizer_mode,
-            trust_remote_code=trust_remote_code,
-            dtype=dtype,
-            seed=seed,
-            swap_space=swap_space,
-            enforce_eager=enforce_eager,
-            disable_log_stats=disable_log_stats,
-            tensor_parallel_size=tensor_parallel_size,
-            max_model_len=max_model_len,
-            block_size=block_size,
-            enable_chunked_prefill=enable_chunked_prefill,
-            **kwargs,
-        )
+        init_ctx = (nullcontext() if default_torch_num_threads is None else
+                    set_default_torch_num_threads(default_torch_num_threads))
+
+        with init_ctx:
+            self.llm = LLM(
+                model=model_name,
+                runner=runner,
+                convert=convert,
+                tokenizer=tokenizer_name,
+                tokenizer_mode=tokenizer_mode,
+                trust_remote_code=trust_remote_code,
+                dtype=dtype,
+                seed=seed,
+                swap_space=swap_space,
+                enforce_eager=enforce_eager,
+                disable_log_stats=disable_log_stats,
+                tensor_parallel_size=tensor_parallel_size,
+                max_model_len=max_model_len,
+                block_size=block_size,
+                enable_chunked_prefill=enable_chunked_prefill,
+                **kwargs,
+            )

    def get_inputs(
        self,
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@ -32,10 +32,6 @@ def _test_stopping(llm: LLM,
    assert output.stop_reason == expected_reason


-def _set_async_mode(llm, is_async):
-    llm.llm_engine.scheduler[0].use_async_output_proc = is_async
-
-
 def _stop_basic(llm):
    _test_stopping(llm,
                   stop=["."],
@ -103,40 +99,8 @@ def test_stop_strings():
    # async output processing below.
    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

-    if envs.VLLM_USE_V1:
-        _stop_basic(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_basic(llm)
-
-        _set_async_mode(llm, False)
-        _stop_basic(llm)
-
-    if envs.VLLM_USE_V1:
-        _stop_multi_tokens(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_multi_tokens(llm)
-
-        _set_async_mode(llm, False)
-        _stop_multi_tokens(llm)
-
-    if envs.VLLM_USE_V1:
-        _stop_partial_token(llm)
-    else:
-        _set_async_mode(llm, True)
-        _stop_partial_token(llm)
-
-        _set_async_mode(llm, False)
-        _stop_partial_token(llm)
-
-    if envs.VLLM_USE_V1:
-        # FIXME: this does not respect include_in_output=False
-        # _stop_token_id(llm)
-        pass
-    else:
-        _set_async_mode(llm, True)
-        _stop_token_id(llm)
-
-        _set_async_mode(llm, False)
-        _stop_token_id(llm)
+    _stop_basic(llm)
+    _stop_multi_tokens(llm)
+    _stop_partial_token(llm)
+    # FIXME: this does not respect include_in_output=False
+    # _stop_token_id(llm)
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@ -25,12 +25,6 @@ TOKEN_IDS = [
 ]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
@pytest.fixture(scope="module")
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@ -6,14 +6,6 @@ import pytest
 from vllm import LLM


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def test_empty_prompt():
    llm = LLM(model="openai-community/gpt2", enforce_eager=True)
    with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@ -60,6 +60,7 @@ def create_dummy_embeds(num_tokens: int = 5) -> str:
    return base64.b64encode(buffer.getvalue()).decode('utf-8')


+@pytest.mark.skip("This test is skipped because it is flaky.")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_completions_with_prompt_embeds(
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@ -432,7 +432,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
            "--port",
            port,
        ],
-                                env={"VLLM_USE_V1": "1" if use_v1 else "0"})
+                                env={"VLLM_USE_V1": "1"})

        def is_server_up(url):
            try:
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@ -69,28 +69,20 @@ def generate_params():

@pytest.mark.parametrize("device, name, use_mla, block_size",
                         generate_params())
-@pytest.mark.parametrize("use_v1", [True, False])
 def test_env(
    device: str,
    name: str,
    use_mla: bool,
    block_size: int,
-    use_v1: bool,
    monkeypatch: pytest.MonkeyPatch,
 ):
    """Test attention backend selection with valid device-backend pairs."""
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, name)
        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")

-        if name == "FLASHINFER" and not use_v1:
-            pytest.skip("FlashInfer backend is only available on V1 engine")
-
        if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
            with patch("vllm.attention.selector.current_platform",
                       CpuPlatform()):
                backend = get_attn_backend(16, torch.float16, None, block_size,
@ -137,7 +129,7 @@ def test_env(
                                                   block_size,
                                                   False,
                                                   use_mla=use_mla)
-                        expected = f"{name}_VLLM_V1" if use_v1 else name
+                        expected = f"{name}_VLLM_V1"
                        assert backend.get_name() == expected
                else:
                    backend = get_attn_backend(16,
@ -146,7 +138,7 @@ def test_env(
                                               block_size,
                                               False,
                                               use_mla=use_mla)
-                    expected = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
+                    expected = "TRITON_ATTN_VLLM_V1"
                    assert backend.get_name() == expected

        elif device == "cuda":
@ -163,11 +155,7 @@ def test_env(
                    # - TRITON_MLA: fallback for other cases

                    if name == "CUTLASS_MLA":
-                        if not use_v1:
-                            # CUTLASS_MLA only supported on V1 engine
-                            pytest.skip(
-                                "CUTLASS_MLA only supported on V1 engine")
-                        elif block_size != 128:
+                        if block_size != 128:
                            # CUTLASS_MLA only supports block_size == 128
                            pytest.skip(
                                "CUTLASS_MLA only supports block_size 128")
@ -181,11 +169,7 @@ def test_env(
                            expected = "CUTLASS_MLA_VLLM_V1"
                            assert backend.get_name() == expected
                    elif name == "FLASHINFER_MLA":
-                        if not use_v1:
-                            # FlashInfer MLA only supported on V1 engine
-                            pytest.skip(
-                                "FlashInfer MLA only supported on V1 engine")
-                        elif block_size not in [32, 64]:
+                        if block_size not in [32, 64]:
                            # FlashInfer MLA only supports block_size 32 or 64
                            pytest.skip(
                                "FlashInfer MLA only supports block_size 32 "
@ -217,23 +201,17 @@ def test_env(
                                                           block_size,
                                                           False,
                                                           use_mla=use_mla)
-                                expected = f"{name}_VLLM_V1" if use_v1 else name
+                                expected = f"{name}_VLLM_V1"
                                assert backend.get_name() == expected
                    elif name == "FLASH_ATTN_MLA":
-                        if not use_v1:
-                            # FlashAttention MLA only supported on V1 engine
-                            pytest.skip(
-                                "FlashAttention MLA only supported on V1 engine"
-                            )
-                        else:
-                            backend = get_attn_backend(16,
-                                                       torch.float16,
-                                                       None,
-                                                       block_size,
-                                                       False,
-                                                       use_mla=use_mla)
-                            expected = "FLASH_ATTN_MLA"
-                            assert backend.get_name() == expected
+                        backend = get_attn_backend(16,
+                                                   torch.float16,
+                                                   None,
+                                                   block_size,
+                                                   False,
+                                                   use_mla=use_mla)
+                        expected = "FLASH_ATTN_MLA"
+                        assert backend.get_name() == expected
                    else:
                        # TRITON_MLA or other fallback
                        backend = get_attn_backend(16,
@ -242,8 +220,7 @@ def test_env(
                                                   block_size,
                                                   False,
                                                   use_mla=use_mla)
-                        expected = ("TRITON_MLA_VLLM_V1"
-                                    if use_v1 else "TRITON_MLA")
+                        expected = "TRITON_MLA_VLLM_V1"
                        assert backend.get_name() == expected
                elif name == "FLASHINFER":
                    backend = get_attn_backend(16,
@ -252,7 +229,7 @@ def test_env(
                                               block_size,
                                               False,
                                               use_mla=use_mla)
-                    expected = "FLASHINFER_VLLM_V1" if use_v1 else name
+                    expected = "FLASHINFER_VLLM_V1"
                    assert backend.get_name() == expected
                else:
                    backend = get_attn_backend(32,
@ -261,36 +238,30 @@ def test_env(
                                               block_size,
                                               False,
                                               use_mla=use_mla)
-                    expected = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+                    expected = "FLASH_ATTN_VLLM_V1"
                    assert backend.get_name() == expected

-                    if use_v1:
-                        backend = get_attn_backend(16,
-                                                   torch.float16,
-                                                   None,
-                                                   block_size,
-                                                   False,
-                                                   use_mla=use_mla)
-                        assert backend.get_name() == "FLEX_ATTENTION", (
-                            "Should fallback to FlexAttention if head size is "
-                            "not supported by FlashAttention")
+                    backend = get_attn_backend(16,
+                                               torch.float16,
+                                               None,
+                                               block_size,
+                                               False,
+                                               use_mla=use_mla)
+                    assert backend.get_name() == "FLEX_ATTENTION", (
+                        "Should fallback to FlexAttention if head size is "
+                        "not supported by FlashAttention")


@pytest.mark.parametrize("device", ["cpu", "cuda"])
-@pytest.mark.parametrize("use_v1", [True, False])
 def test_fp32_fallback(
    device: str,
-    use_v1: bool,
    monkeypatch: pytest.MonkeyPatch,
 ):
    """Test attention backend selection with fp32."""
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")

        if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
            with patch("vllm.attention.selector.current_platform",
                       CpuPlatform()):
                backend = get_attn_backend(16, torch.float32, None, 16, False)
@ -300,8 +271,7 @@ def test_fp32_fallback(
            with patch("vllm.attention.selector.current_platform",
                       CudaPlatform()):
                backend = get_attn_backend(16, torch.float32, None, 16, False)
-            assert (backend.get_name() == "FLEX_ATTENTION"
-                    if use_v1 else "XFORMERS")
+            assert backend.get_name() == "FLEX_ATTENTION"


 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
@ -357,12 +327,11 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
        assert backend.get_name() != STR_FLASH_ATTN_VAL


-@pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
+def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
    """Test that invalid attention backend names raise ValueError."""
    with monkeypatch.context() as m, patch(
            "vllm.attention.selector.current_platform", CudaPlatform()):
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

        # Should raise ValueError for invalid backend
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@ -17,7 +17,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.interfaces import SupportsLoRA
 from vllm.platforms import current_platform
@ -97,7 +96,6 @@ def dummy_model() -> nn.Module:
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
-            ("sampler", Sampler())
        ]))
    model.config = MagicMock()
    model.embedding_modules = {"lm_head": "lm_head"}
@ -125,7 +123,6 @@ def dummy_model_gate_up() -> nn.Module:
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("logits_processor", LogitsProcessor(512)),
-            ("sampler", Sampler())
        ]))
    model.config = MagicMock()
    model.packed_modules_mapping = {
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@ -6,10 +6,10 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 import pytest

 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
-from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.lora.request import LoRARequest
+from vllm.v1.engine.llm_engine import LLMEngine

 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@ -15,7 +15,8 @@ from ...utils import check_logprobs_close
 # have a clean way to fall back, so we fail with
 # a clear msg when it happens.
 # https://github.com/vllm-project/vllm/issues/14524
-REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
+# NOTE(woosuk): Skipping these tests until V1 supports them.
+# REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]

 # This list contains the model that are using AITER kernel.
 # Skip model that are not using AITER tests.
@ -113,9 +114,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

-    if model in REQUIRES_V0:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
-
    if use_rocm_aiter and (model in AITER_MODEL_LIST):
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    elif use_rocm_aiter and model not in AITER_MODEL_LIST:
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@ -8,7 +8,7 @@ from tests.utils import multi_gpu_test
 from vllm.engine.arg_utils import EngineArgs
 from vllm.sampling_params import SamplingParams

-from ...utils import check_logprobs_close, check_outputs_equal
+from ...utils import check_logprobs_close

 # Mark all tests as hybrid
 pytestmark = pytest.mark.hybrid_model
@ -88,15 +88,6 @@ def test_models(
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        if model not in V0_UNSUPPORTED_MODELS:
-            with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-                vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-                    example_prompts, max_tokens, num_logprobs)
-        else:
-            vllm_v0_outputs = None
-
    if model in V1_SUPPORTED_MODELS:
        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
            vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
@ -104,14 +95,6 @@ def test_models(
    else:
        vllm_v1_outputs = None

-    if vllm_v0_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
-
    if model in V1_SUPPORTED_MODELS:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
@ -157,45 +140,6 @@ def test_batching(
    )


-@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
-def test_chunked_prefill(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    max_tokens: int,
-    num_logprobs: int,
-    chunked_prefill_token_size: int,
-    monkeypatch,
-) -> None:
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with vllm_runner(model,
-                         enable_chunked_prefill=True,
-                         max_num_batched_tokens=max_num_batched_tokens,
-                         max_num_seqs=max_num_seqs) as vllm_model:
-            chunked = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
-        with vllm_runner(model,
-                         enable_chunked_prefill=False,
-                         max_num_seqs=max_num_seqs) as vllm_model:
-            non_chunked = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
-        check_logprobs_close(
-            outputs_0_lst=chunked,
-            outputs_1_lst=non_chunked,
-            name_0="chunked",
-            name_1="non_chunked",
-        )
-
-
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [10])
 def test_chunked_prefill_with_parallel_sampling(
@ -257,38 +201,6 @@ def test_mamba_cache_cg_padding(
            "Could be related to mamba cache not padded correctly")


-@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
-@pytest.mark.parametrize("max_tokens", [20])
-def test_models_preemption_recompute(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    max_tokens: int,
-    monkeypatch,
-) -> None:
-    """
-    Tests that outputs are identical with and w/o preemptions (recompute).
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-            scheduler = vllm_model.llm.llm_engine.scheduler[0]
-            scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
-            preempt_vllm_outputs = vllm_model.generate_greedy(
-                example_prompts, max_tokens)
-
-            scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=preempt_vllm_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="vllm_preepmtions",
-            name_1="vllm",
-        )
-
-
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
 def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
@ -386,27 +298,10 @@ def test_full_cuda_graph(
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        if model not in V0_UNSUPPORTED_MODELS:
-            with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
-                vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-                    example_prompts, max_tokens, num_logprobs)
-        else:
-            vllm_v0_outputs = None
-
    with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    if vllm_v0_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
-
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_v1_outputs,
@ -442,27 +337,12 @@ def test_fp32_cache_state(
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with vllm_runner(model,
-                         max_num_seqs=MAX_NUM_SEQS,
-                         **{cache_dtype_param: "float32"}) as vllm_model:
-            vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, num_logprobs)
-
    with vllm_runner(model,
                     max_num_seqs=MAX_NUM_SEQS,
                     **{cache_dtype_param: "float32"}) as vllm_model:
        vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

-    check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_v0_outputs,
-        name_0="hf",
-        name_1="vllm-v0",
-    )
-
    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_v1_outputs,
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest
 import torch
@ -82,7 +81,7 @@ def test_prm_models(
    check_transformers_version("Qwen/Qwen2.5-Math-PRM-7B",
                               max_transformers_version="4.53.2")

-    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+    if current_platform.is_cpu():
        pytest.skip("CPU only supports V1")

    if current_platform.is_rocm():
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@ -32,13 +32,6 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
 if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

-REQUIRES_V0_MODELS = [
-    # V1 Test: not enough KV cache space in C1.
-    "fuyu",
-    # V1 Test: Deadlock issue when processing mm_inputs
-    "llava-onevision-transformers",
-]
-
 # yapf: disable
 COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
@ -186,8 +179,11 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
+            "default_torch_num_threads": 1,
        },
-        marks=[pytest.mark.core_model],
+        # FIXME: Investigate why the test hangs
+        # when processing the 3rd prompt in vLLM
+        marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
    ),
    "idefics3-transformers": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
@ -320,6 +316,7 @@ VLM_TEST_SETTINGS = {
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[large_gpu_mark(min_gb=32)],
    ),
    "gemma3": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
@ -861,13 +858,14 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ))
-def test_single_image_models(tmp_path: PosixPath, model_type: str,
-                             test_case: ExpandableVLMTestArgs,
-                             hf_runner: type[HfRunner],
-                             vllm_runner: type[VllmRunner],
-                             image_assets: ImageTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_single_image_models(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
@ -886,13 +884,14 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ))
-def test_multi_image_models(tmp_path: PosixPath, model_type: str,
-                            test_case: ExpandableVLMTestArgs,
-                            hf_runner: type[HfRunner],
-                            vllm_runner: type[VllmRunner],
-                            image_assets: ImageTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_multi_image_models(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
@ -911,13 +910,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ))
-def test_image_embedding_models(model_type: str,
-                                test_case: ExpandableVLMTestArgs,
-                                hf_runner: type[HfRunner],
-                                vllm_runner: type[VllmRunner],
-                                image_assets: ImageTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_image_embedding_models(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
@ -935,11 +934,13 @@ def test_image_embedding_models(model_type: str,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ))
-def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
-                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
-                      video_assets: VideoTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_video_models(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    video_assets: VideoTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
@ -957,11 +958,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ))
-def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
-                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
-                      audio_assets: AudioTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_audio_models(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
@ -984,10 +987,7 @@ def test_custom_inputs_models(
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
-    monkeypatch,
 ):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
@ -1006,13 +1006,14 @@ def test_custom_inputs_models(
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
-def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
-                                   test_case: ExpandableVLMTestArgs,
-                                   hf_runner: type[HfRunner],
-                                   vllm_runner: type[VllmRunner],
-                                   image_assets: ImageTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_single_image_models_heavy(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
@ -1032,13 +1033,14 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
-def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
-                                  test_case: ExpandableVLMTestArgs,
-                                  hf_runner: type[HfRunner],
-                                  vllm_runner: type[VllmRunner],
-                                  image_assets: ImageTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_multi_image_models_heavy(
+    tmp_path: PosixPath,
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
@ -1058,14 +1060,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
        create_new_process_for_each_test=True,
    ))
@create_new_process_for_each_test()
-def test_image_embedding_models_heavy(model_type: str,
-                                      test_case: ExpandableVLMTestArgs,
-                                      hf_runner: type[HfRunner],
-                                      vllm_runner: type[VllmRunner],
-                                      image_assets: ImageTestAssets,
-                                      monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_image_embedding_models_heavy(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
@ -1083,12 +1084,13 @@ def test_image_embedding_models_heavy(model_type: str,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ))
-def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
-                            hf_runner: type[HfRunner],
-                            vllm_runner: type[VllmRunner],
-                            video_assets: VideoTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_video_models_heavy(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    video_assets: VideoTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
@ -1106,12 +1108,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ))
-def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
-                            hf_runner: type[HfRunner],
-                            vllm_runner: type[VllmRunner],
-                            audio_assets: AudioTestAssets, monkeypatch):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
+def test_audio_models_heavy(
+    model_type: str,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
@ -1135,10 +1138,7 @@ def test_custom_inputs_models_heavy(
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
-    monkeypatch,
 ):
-    if model_type in REQUIRES_V0_MODELS:
-        monkeypatch.setenv("VLLM_USE_V1", "0")
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@ -7,8 +7,8 @@ from typing import Optional
 import pytest
 from transformers import AutoModelForSpeechSeq2Seq

+from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
-from vllm.sequence import SampleLogprobs

 from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
                          VllmRunner)
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@ -12,10 +12,10 @@ from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

 from vllm.assets.image import ImageAsset
+from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import convert_image_mode, rescale_image_size
 from vllm.platforms import current_platform
-from vllm.sequence import SampleLogprobs

 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
                          PromptImageInput, VllmRunner)
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@ -12,13 +12,12 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
 from transformers import AutoProcessor

-from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
+from vllm import SamplingParams, TextPrompt, TokensPrompt
+from vllm.logprobs import Logprob, SampleLogprobs
 from vllm.multimodal import MultiModalDataBuiltins
-from vllm.multimodal.inputs import PlaceholderRange
-from vllm.sequence import Logprob, SampleLogprobs

 from ....utils import VLLM_PATH, large_gpu_test
-from ...utils import check_logprobs_close, dummy_hf_overrides
+from ...utils import check_logprobs_close

 if TYPE_CHECKING:
    from _typeshed import StrPath
@ -185,47 +184,3 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
                         outputs_1_lst=logprobs,
                         name_0="h100_ref",
                         name_1="output")
-
-
-@pytest.mark.parametrize(
-    "image_urls,expected_ranges",
-    [(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
-     (IMG_URLS[1:4], [
-         PlaceholderRange(offset=11, length=266),
-         PlaceholderRange(offset=277, length=1056),
-         PlaceholderRange(offset=1333, length=418)
-     ])])
-def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
-                                  expected_ranges: list[PlaceholderRange],
-                                  local_asset_server, monkeypatch) -> None:
-    local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
-    prompt = _create_engine_inputs_hf(local_image_urls)
-
-    # This placeholder checking test only works with V0 engine
-    # where `multi_modal_placeholders` is returned with `RequestOutput`
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    with vllm_runner(
-            "mistral-community/pixtral-12b",
-            max_model_len=8192,
-            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
-            load_format="dummy",
-            hf_overrides=dummy_hf_overrides,
-    ) as vllm_model:
-        outputs = vllm_model.llm.generate(prompt)
-
-        assert len(outputs) == 1, f"{len(outputs)=}"
-        output: RequestOutput = outputs[0]
-        assert hasattr(output,
-                       "multi_modal_placeholders"), f"{output.__dict__=}"
-        assert "image" in output.multi_modal_placeholders, \
-            f"{output.multi_modal_placeholders.keys()=}"
-        image_placeholder_ranges: list[
-            PlaceholderRange] = output.multi_modal_placeholders["image"]
-        assert len(image_placeholder_ranges) == len(
-            expected_ranges), f"{image_placeholder_ranges=}"
-        for real_range, expected_range in zip(image_placeholder_ranges,
-                                              expected_ranges):
-            assert real_range.offset == expected_range.offset, \
-                f"{real_range=} {expected_range=}"
-            assert real_range.length == expected_range.length, \
-                f"{real_range=} {expected_range=}"
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@ -10,7 +10,6 @@ from PIL import Image

 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
-from vllm.utils import set_default_torch_num_threads

 from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
                          PromptVideoInput, VllmRunner)
@ -264,8 +263,7 @@ def run_embedding_input_test(
    processor = AutoProcessor.from_pretrained(model)

    # max_model_len should be greater than image_feature_size
-    with set_default_torch_num_threads(1):
-        vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            runner="generate",
            max_model_len=4000,
@ -277,9 +275,8 @@ def run_embedding_input_test(
            },
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
-        )
-
-    with vllm_model:
+            default_torch_num_threads=1,
+    ) as vllm_model:
        outputs_per_case_for_original_input = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@ -19,7 +19,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
                          GenerationConfig, GenerationMixin)
 from transformers.video_utils import VideoMetadata

-from vllm.sequence import SampleLogprobs
+from vllm.logprobs import SampleLogprobs
 from vllm.utils import is_list_of

 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@ -12,7 +12,7 @@ from transformers import AutoModelForCausalLM
 from transformers.models.auto.auto_factory import _BaseAutoModelClass

 from vllm.config import RunnerOption
-from vllm.sequence import SampleLogprobs
+from vllm.logprobs import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer

 from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@ -4,8 +4,6 @@
 import pytest
 import torch

-from vllm.utils import set_default_torch_num_threads
-
 from ....conftest import VllmRunner


@ -30,19 +28,17 @@ def _run_test(
        } for _ in range(10)
    ]

-    with (
-            set_default_torch_num_threads(1),
-            vllm_runner(
-                model,
-                runner="pooling",
-                dtype=torch.float16,
-                enforce_eager=True,
-                skip_tokenizer_init=True,
-                # Limit the maximum number of sequences to avoid the
-                # test going OOM during the warmup run
-                max_num_seqs=32,
-            ) as vllm_model,
-    ):
+    with vllm_runner(
+            model,
+            runner="pooling",
+            dtype="half",
+            enforce_eager=True,
+            skip_tokenizer_init=True,
+            # Limit the maximum number of sequences to avoid the
+            # test going OOM during the warmup run
+            max_num_seqs=32,
+            default_torch_num_threads=1,
+    ) as vllm_model:
        vllm_model.encode(prompt)


--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@ -45,12 +45,15 @@ def run_awq_test(
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
-    with vllm_runner(source_model,
-                     max_model_len=4096,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(
+            source_model,
+            max_model_len=4096,
+            dtype=dtype,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            enforce_eager=True,
+            default_torch_num_threads=1,
+    ) as vllm_model:
        source_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
@ -59,13 +62,16 @@ def run_awq_test(
            for prompts, images in inputs_per_image
        ]

-    with vllm_runner(quant_model,
-                     quantization="awq",
-                     max_model_len=4096,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(
+            quant_model,
+            quantization="awq",
+            max_model_len=4096,
+            dtype=dtype,
+            tensor_parallel_size=tensor_parallel_size,
+            distributed_executor_backend=distributed_executor_backend,
+            enforce_eager=True,
+            default_torch_num_threads=1,
+    ) as vllm_model:
        quant_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
@ -108,12 +114,8 @@ def run_awq_test(
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
 def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
-                    size_factors, dtype, max_tokens, num_logprobs,
-                    monkeypatch) -> None:
+                    size_factors, dtype, max_tokens, num_logprobs) -> None:

-    # Test V1: this test hangs during setup on single-scale input.
-    # TODO: figure out why and re-enable this on V1.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
    run_awq_test(
        vllm_runner,
        image_assets,
--- a/tests/models/quantization/test_bitsandbytes.py
+++ b/tests/models/quantization/test_bitsandbytes.py
@ -5,10 +5,7 @@
 Run `pytest tests/quantization/test_bitsandbytes.py`.
 '''

-import gc
-
 import pytest
-import torch
 from transformers import BitsAndBytesConfig

 from tests.quantization.utils import is_quant_method_supported
@ -131,12 +128,15 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
    ))
    with vllm_runner(model_name,
                     quantization='bitsandbytes',
-                     enforce_eager=False) as llm:
+                     enforce_eager=False,
+                     default_torch_num_threads=1) as llm:
        vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
                                                    max_tokens=32,
                                                    num_logprobs=5)

-    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
+    with hf_runner(model_name,
+                   model_kwargs=hf_model_kwargs,
+                   default_torch_num_threads=1) as llm:
        transformers_outputs = llm.generate_greedy_logprobs_limit(
            example_prompts, max_tokens=32, num_logprobs=5)
    check_logprobs_close(
@ -174,7 +174,8 @@ def test_4bit_bnb_embedding_model(
                     runner="pooling",
                     dtype=dtype,
                     gpu_memory_utilization=0.5,
-                     quantization="bitsandbytes") as vllm_model:
+                     quantization="bitsandbytes",
+                     default_torch_num_threads=1) as vllm_model:
        vllm_outputs = vllm_model.embed(example_prompts)

    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
@ -184,6 +185,7 @@ def test_4bit_bnb_embedding_model(
            dtype=dtype,
            model_kwargs=hf_model_kwargs,
            is_sentence_transformer=True,
+            default_torch_num_threads=1,
    ) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

@ -222,26 +224,22 @@ def validate_generated_texts(hf_runner,
    with vllm_runner(model_name,
                     quantization=None if pre_quant else 'bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=False) as llm:
+                     enforce_eager=False,
+                     default_torch_num_threads=1) as llm:

        vllm_outputs = llm.generate_greedy(prompts, max_tokens)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

-    # Clean up the GPU memory for the next test
-    gc.collect()
-    torch.cuda.empty_cache()
-
    if hf_model_kwargs is None:
        hf_model_kwargs = {}

    # Run with HF runner
-    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
+    with hf_runner(model_name,
+                   model_kwargs=hf_model_kwargs,
+                   default_torch_num_threads=1) as llm:
        hf_outputs = llm.generate_greedy(prompts, max_tokens)
        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

-    # Clean up the GPU memory for the next test
-    gc.collect()
-    torch.cuda.empty_cache()
    # Compare the generated strings
    for hf_log, vllm_log in zip(hf_logs, vllm_logs):
        hf_str = hf_log["generated_text"]
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@ -32,13 +32,10 @@ from ..utils import check_logprobs_close
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive for
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_models(
    vllm_runner,
    example_prompts,
@ -49,7 +46,6 @@ def test_models(
    enforce_eager: bool,
    backend: str,
    tensor_parallel_size: int,
-    disable_async_output_proc: bool,
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
@ -61,6 +57,9 @@ def test_models(
        pytest.skip(
            f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")

+    if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
+        pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")
+
    with monkeypatch.context() as m:
        m.setenv("TOKENIZERS_PARALLELISM", 'true')
        m.setenv(STR_BACKEND_ENV_VAR, backend)
@ -74,7 +73,6 @@ def test_models(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                kv_cache_dtype="auto",
-                disable_async_output_proc=disable_async_output_proc,
        ) as vllm_model:
            baseline_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS)
@ -85,7 +83,6 @@ def test_models(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                kv_cache_dtype=kv_cache_dtype,
-                disable_async_output_proc=disable_async_output_proc,
        ) as vllm_model:
            test_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS)
@ -110,9 +107,6 @@ def test_models(
    ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
-# Due to low-precision numerical divergence, this test is too sensitive for
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
 def test_cpu_models(
    vllm_runner,
    example_prompts,
@ -120,7 +114,6 @@ def test_cpu_models(
    base_model: str,
    test_model: str,
    max_tokens: int,
-    disable_async_output_proc: bool,
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """
@ -138,7 +131,6 @@ def test_cpu_models(
                max_model_len=MAX_MODEL_LEN,
                dtype="bfloat16",
                kv_cache_dtype="auto",
-                disable_async_output_proc=disable_async_output_proc,
        ) as vllm_model:
            baseline_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS)
@ -148,7 +140,6 @@ def test_cpu_models(
                max_model_len=MAX_MODEL_LEN,
                dtype="bfloat16",
                kv_cache_dtype=kv_cache_dtype,
-                disable_async_output_proc=disable_async_output_proc,
        ) as vllm_model:
            test_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS)
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@ -7,7 +7,6 @@ from unittest.mock import patch
 import pytest

 from vllm import LLM
-from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
 from vllm.utils import GiB_bytes
 from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
 from vllm.v1.engine.core import EngineCore as V1EngineCore
@ -61,10 +60,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                                  False))

    # Avoid calling model.forward()
-    def _initialize_kv_caches_v0(self) -> None:
-        self.cache_config.num_gpu_blocks = 0
-        self.cache_config.num_cpu_blocks = 0
-
    def _initialize_kv_caches_v1(self, vllm_config):
        kv_cache_specs = self.model_executor.get_kv_cache_specs()
        scheduler_kv_cache_config = get_kv_cache_configs(
@ -76,12 +71,12 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
        # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
        return 1, 0, scheduler_kv_cache_config

-    with (patch.object(V0LLMEngine, "_initialize_kv_caches",
-                       _initialize_kv_caches_v0),
-          patch.object(V1EngineCore, "_initialize_kv_caches",
+    with (patch.object(V1EngineCore, "_initialize_kv_caches",
                       _initialize_kv_caches_v1), monkeypatch.context() as m):
        if model_info.v0_only:
-            m.setenv("VLLM_USE_V1", "0")
+            # NOTE(woosuk): skip the test for V0-only models
+            return
+
        if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
            # Phi4FlashForCausalLM and MotifForCausalLM
            # only supports DIFFERENTIAL_FLASH_ATTN backend
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@ -42,6 +42,7 @@ def test_oot_registration_text_generation(
            assert rest == ""


+@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
@create_new_process_for_each_test()
 def test_oot_registration_embedding(
    monkeypatch: pytest.MonkeyPatch,
@ -62,6 +63,7 @@ def test_oot_registration_embedding(
 image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")


+@pytest.mark.skip(reason="This test is skipped because it failed on V1.")
@create_new_process_for_each_test()
 def test_oot_registration_multimodal(
    monkeypatch: pytest.MonkeyPatch,
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@ -5,7 +5,6 @@ import pytest
 import torch

 from tests.conftest import VllmRunner
-from vllm.utils import set_default_torch_num_threads


@pytest.mark.parametrize(
@ -25,19 +24,17 @@ def test_inference(
    prompt = dict(prompt_token_ids=[1],
                  multi_modal_data=dict(pixel_values=pixel_values,
                                        location_coords=location_coords))
-    with (
-            set_default_torch_num_threads(1),
-            vllm_runner(
-                model,
-                runner="pooling",
-                dtype=torch.float16,
-                enforce_eager=True,
-                skip_tokenizer_init=True,
-                # Limit the maximum number of sequences to avoid the
-                # test going OOM during the warmup run
-                max_num_seqs=32,
-            ) as vllm_model,
-    ):
+    with vllm_runner(
+            model,
+            runner="pooling",
+            dtype="half",
+            enforce_eager=True,
+            skip_tokenizer_init=True,
+            # Limit the maximum number of sequences to avoid the
+            # test going OOM during the warmup run
+            max_num_seqs=32,
+            default_torch_num_threads=1,
+    ) as vllm_model:

        vllm_output = vllm_model.llm.encode(prompt)
        assert torch.equal(
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@ -12,7 +12,7 @@ from transformers import PretrainedConfig

 from vllm.config import ModelConfig, ModelDType, RunnerOption
 from vllm.inputs import InputContext
-from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs

 from .registry import HF_EXAMPLE_MODELS

--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@ -9,7 +9,6 @@ from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
                                              LlavaForConditionalGeneration,
                                              LlavaMultiModalProcessor,
                                              LlavaProcessingInfo)
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY


@ -18,11 +17,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
                                        dummy_inputs=LlavaDummyInputsBuilder)
 class MyLlava(LlavaForConditionalGeneration):

-    def compute_logits(
-            self, hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
+        logits = super().compute_logits(hidden_states)
        if logits is not None:
            logits.zero_()
            logits[:, 0] += 1.0
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@ -6,16 +6,14 @@ from typing import Optional
 import torch

 from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.sampling_metadata import SamplingMetadata


 class MyOPTForCausalLM(OPTForCausalLM):

-    def compute_logits(
-            self, hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+    def compute_logits(self,
+                       hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
        # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
+        logits = super().compute_logits(hidden_states)
        if logits is not None:
            logits.zero_()
            logits[:, 0] += 1.0
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@ -7,15 +7,6 @@ import torch
 from vllm.plugins import load_general_plugins


-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-
-
 def test_platform_plugins():
    # simulate workload by running an example
    import runpy
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@ -3,47 +3,18 @@

 import pytest

-from vllm.core.scheduler import Scheduler
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
-from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
-from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.engine.llm_engine import LLMEngine


-class DummyV0Scheduler(Scheduler):
-
-    def schedule(self):
-        raise Exception("Exception raised by DummyV0Scheduler")
-
-
-class DummyV1Scheduler(V1Scheduler):
+class DummyV1Scheduler(Scheduler):

    def schedule(self):
        raise Exception("Exception raised by DummyV1Scheduler")


-def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        with pytest.raises(Exception) as exception_info:
-
-            engine_args = EngineArgs(
-                model="facebook/opt-125m",
-                enforce_eager=True,  # reduce test time
-                scheduler_cls=DummyV0Scheduler,
-            )
-
-            engine = LLMEngine.from_engine_args(engine_args=engine_args)
-
-            sampling_params = SamplingParams(max_tokens=1)
-            engine.add_request("0", "foo", sampling_params)
-            engine.step()
-
-        assert str(
-            exception_info.value) == "Exception raised by DummyV0Scheduler"
-
-
 def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
@ -59,7 +30,7 @@ def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
                scheduler_cls=DummyV1Scheduler,
            )

-            engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
+            engine = LLMEngine.from_engine_args(engine_args=engine_args)

            sampling_params = SamplingParams(max_tokens=1)
            engine.add_request("0", "foo", sampling_params)
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@ -357,6 +357,9 @@ def test_compressed_tensors_fp8(vllm_runner):
        assert output


+@pytest.mark.skipif(
+    not current_platform.is_kv_cache_dtype_supported("fp8", None),
+    reason="FP8 KV cache is not supported on this device.")
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="This test is skipped on non-CUDA platform.")
 def test_compressed_tensors_kv_cache(vllm_runner):
@ -738,4 +741,4 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
    with vllm_runner(model, enforce_eager=True) as llm:
        perplexity = llm.generate_prompt_perplexity([prompt])[0]
        print(perplexity)
-        assert perplexity <= exp_perplexity
+        assert perplexity <= exp_perplexity
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@ -10,13 +10,6 @@ from transformers import AutoModelForSeq2SeqLM

 from vllm.assets.audio import AudioAsset

-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
 #   2. Increase beam_width to 8.
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@ -9,13 +9,6 @@ import pytest

 from vllm import SamplingParams

-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
 MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"]
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@ -8,12 +8,6 @@ from vllm import SamplingParams
 MODELS = ["distilbert/distilgpt2"]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
 def test_ranks(
--- a/tests/speculative_decoding/speculators/test_eagle3.py
+++ b/tests/speculative_decoding/speculators/test_eagle3.py
@ -3,38 +3,52 @@
 import pytest
 import torch

+from vllm.config import SpeculativeConfig
 from vllm.model_executor.models.interfaces import supports_eagle3


-@pytest.mark.parametrize(
-    "model_path",
-    [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")])
-def test_llama(vllm_runner, example_prompts, model_path, monkeypatch):
+@pytest.mark.parametrize("model_path", [
+    pytest.param(
+        "nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized",
+        id="llama3-eagle3-speculator"),
+    pytest.param(
+        "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized",
+        id="qwen3-eagle3-speculator"),
+])
+def test_eagle3_speculators_model(vllm_runner, example_prompts, model_path,
+                                  monkeypatch):
+    """
+    Test Eagle3 speculators models properly initialize speculative decoding.
+
+    This test verifies:
+    1. Eagle3 support is detected for the model
+    2. Speculative config is automatically initialized from embedded config
+    3. The draft model path is correctly set to the speculators model
+    4. Speculative tokens count is valid
+    5. Text generation works with speculative decoding enabled
+    """
    # Set environment variable for V1 engine serialization
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
+        # Verify Eagle3 support is detected
        eagle3_supported = vllm_model.apply_model(supports_eagle3)
-        assert eagle3_supported
+        assert eagle3_supported, f"Eagle3 should be supported for {model_path}"
+
+        vllm_config = vllm_model.llm.llm_engine.vllm_config
+
+        assert isinstance(vllm_config.speculative_config, SpeculativeConfig), \
+            "Speculative config should be initialized for speculators model"
+
+        spec_config = vllm_config.speculative_config
+        assert spec_config.num_speculative_tokens > 0, \
+            (f"Expected positive speculative tokens, "
+             f"got {spec_config.num_speculative_tokens}")
+
+        assert spec_config.model == model_path, \
+            f"Draft model should be {model_path}, got {spec_config.model}"

        vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                  max_tokens=20)
-        print(vllm_outputs)
-        assert vllm_outputs
-
-
-@pytest.mark.parametrize(
-    "model_path",
-    [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")])
-def test_qwen(vllm_runner, example_prompts, model_path, monkeypatch):
-    # Set environment variable for V1 engine serialization
-    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-
-    with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model:
-        eagle3_supported = vllm_model.apply_model(supports_eagle3)
-        assert eagle3_supported
-
-        vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                  max_tokens=20)
-        print(vllm_outputs)
-        assert vllm_outputs
+        assert vllm_outputs, \
+            f"No outputs generated for speculators model {model_path}"
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@ -57,10 +57,19 @@ def llama_3p2_1b_files():

 def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
    llm_sharded_writer = LLM(model=input_dir, **kwargs)
-
+    # Check which engine version is being used
+    is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
    # Dump worker states to output directory
-    llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
-        path=output_dir)
+    if is_v1_engine:
+        # For V1 engine, we need to use engine_core.save_sharded_state
+        print("Using V1 engine save path")
+        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
+            path=output_dir)
+    else:
+        # For V0 engine
+        print("Using V0 engine save path")
+        model_executor = llm_sharded_writer.llm_engine.model_executor
+        model_executor.save_sharded_state(path=output_dir)

    # Copy metadata files to output directory
    for file in os.listdir(input_dir):
@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
    gpu_memory_utilization = 0.8
    input_dir = llama_3p2_1b_files
    ctx = mp.get_context("spawn")
-    # The interface in v1 engine has changed, run in v1 engine will hang.
-    monkeypatch.setenv("VLLM_USE_V1", "0")

    # Run in separate processes for memory & CUDA isolation
    with TemporaryDirectory() as output_dir:
@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
                        args=(input_dir, output_dir, weights_patterns),
                        kwargs=dict(
                            tensor_parallel_size=tp_size,
-                            distributed_executor_backend="mp",
                            gpu_memory_utilization=gpu_memory_utilization,
                            enforce_eager=True,
                        ))
@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
        p = ctx.Process(target=_run_generate,
                        args=(input_dir, queue),
                        kwargs=dict(
-                            distributed_executor_backend="mp",
                            enable_lora=enable_lora,
                            gpu_memory_utilization=gpu_memory_utilization,
                            tensor_parallel_size=tp_size,
@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
        p = ctx.Process(target=_run_generate,
                        args=(output_dir, queue),
                        kwargs=dict(
-                            distributed_executor_backend="mp",
                            enable_lora=enable_lora,
                            gpu_memory_utilization=gpu_memory_utilization,
                            tensor_parallel_size=tp_size,
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@ -8,10 +8,7 @@ import pytest
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

-from vllm.inputs import token_inputs
-from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
-from vllm.transformers_utils.detokenizer import Detokenizer
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer,
@ -217,193 +214,3 @@ def test_oov_decode(tokenizer, fast):

    assert decoded_text == ''
    assert out_ids == [len(tokenizer)]
-
-
-@pytest.fixture
-def detokenizer(tokenizer_name: str) -> Detokenizer:
-    tokenizer = get_tokenizer(
-        tokenizer_name,
-        tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto",
-        trust_remote_code=False,
-        revision=None,
-    )
-
-    return Detokenizer(tokenizer)
-
-
-@pytest.fixture(name="complete_sequence_token_ids")
-def create_complete_sequence_token_ids(complete_sequence: str,
-                                       tokenizer) -> list[int]:
-    return tokenizer(complete_sequence, add_special_tokens=False).input_ids
-
-
-def create_sequence(prompt_token_ids=None):
-    prompt_token_ids = prompt_token_ids or []
-    return Sequence(
-        seq_id=0,
-        inputs=token_inputs(prompt_token_ids),
-        block_size=16,
-    )
-
-
-def create_dummy_logprobs(
-        complete_sequence_token_ids: list[int]) -> list[dict[int, Logprob]]:
-    return [{
-        token_id: Logprob(logprob=0.0),
-        token_id + 1: Logprob(logprob=0.1)
-    } for token_id in complete_sequence_token_ids]
-
-
-def create_dummy_prompt_logprobs(
-        complete_sequence_token_ids: list[int]
-) -> list[Optional[dict[int, Any]]]:
-    # logprob for the first prompt token is None.
-    logprobs: list[Optional[dict[int, Any]]] = [None]
-    logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:])
-    return logprobs
-
-
-@pytest.mark.parametrize("complete_sequence", TRUTH)
-@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True)
-def test_decode_sequence_logprobs(complete_sequence: str,
-                                  complete_sequence_token_ids: list[int],
-                                  detokenizer: Detokenizer,
-                                  skip_special_tokens: bool):
-    """Verify Detokenizer decodes logprobs correctly."""
-    sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens,
-                                     logprobs=2)
-
-    # Run sequentially.
-    seq = create_sequence()
-    dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
-    sequential_logprobs_text_chosen_token: list[str] = []
-    sequential_logprobs_text_other_token: list[str] = []
-    for new_token, logprobs in zip(complete_sequence_token_ids,
-                                   dummy_logprobs):
-        seq.append_token_id(new_token, logprobs)
-        detokenizer.decode_sequence_inplace(seq, sampling_params)
-        sequential_logprobs_text_chosen_token.append(
-            seq.output_logprobs[-1][new_token].decoded_token)
-        sequential_logprobs_text_other_token.append(
-            seq.output_logprobs[-1][new_token + 1].decoded_token)
-    sequential_result = seq.output_text
-
-    assert sequential_result == "".join(sequential_logprobs_text_chosen_token)
-    assert sequential_result != "".join(sequential_logprobs_text_other_token)
-
-    if not skip_special_tokens:
-        # Text for logprobs for the chosen token should be the same as the
-        # generated text. Note that this will only be true if we skip
-        # special tokens.
-        assert sequential_result == complete_sequence
-
-
-@pytest.mark.parametrize("complete_sequence", TRUTH)
-@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-def test_decode_prompt_logprobs(complete_sequence: str,
-                                complete_sequence_token_ids: list[int],
-                                detokenizer: Detokenizer):
-
-    # We want to use skip_special_tokens=False here but Mistral tokenizers
-    # don't support that.
-    if complete_sequence not in SPECIAL_TOKS_TRUTH:
-        skip_special_tokens = True
-    elif not isinstance(detokenizer.tokenizer, MistralTokenizer):
-        skip_special_tokens = False
-    else:
-        pytest.skip("MistralTokenizers don't support "
-                    "skip_special_tokens=False")
-        return
-    """Verify Detokenizer decodes prompt logprobs correctly."""
-    sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens,
-                                     prompt_logprobs=1)
-
-    # Run sequentially.
-    seq = create_sequence(complete_sequence_token_ids)
-    seq_group = SequenceGroup(request_id="1",
-                              seqs=[seq],
-                              sampling_params=sampling_params,
-                              arrival_time=0.0)
-    dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids)
-    detokenizer.decode_prompt_logprobs_inplace(seq_group,
-                                               dummy_logprobs,
-                                               position_offset=0)
-    # First logprob is None.
-    decoded_prompt_logprobs: list[dict[int, Any]] = dummy_logprobs[
-        1:]  # type: ignore
-
-    # decoded_prompt_logprobs doesn't contain the first token.
-    token_ids = complete_sequence_token_ids
-    tokenizer = detokenizer.tokenizer
-    text_full = tokenizer.decode(token_ids,
-                                 skip_special_tokens=skip_special_tokens)
-    text_first = tokenizer.decode(token_ids[0],
-                                  skip_special_tokens=skip_special_tokens)
-    text = text_full[len(text_first):]
-
-    # Text for logprobs for the chosen token should be the same as the
-    # prompt text. Note that the first logprob is None.
-    assert text == "".join([
-        logprobs[token_id].decoded_token
-        for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
-    ])
-    assert text != "".join([
-        logprobs[token_id + 1].decoded_token
-        for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
-    ])
-
-
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
-def test_decode_prompt_logprobs_chunked_prefill(
-    vllm_runner,
-    model,
-    chunked_prefill_token_size: int,
-    example_prompts,
-    monkeypatch,
-):
-    # VLLM V1 does not use incremental detokenization for
-    # prompt logprobs, so this test strategy is irrelevant.
-    monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    max_num_seqs = 256
-    enable_chunked_prefill = False
-    max_num_batched_tokens = None
-    if chunked_prefill_token_size != -1:
-        enable_chunked_prefill = True
-        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
-        max_num_batched_tokens = chunked_prefill_token_size
-
-    with vllm_runner(model,
-                     dtype="half",
-                     max_logprobs=5,
-                     gpu_memory_utilization=0.5,
-                     enable_chunked_prefill=enable_chunked_prefill,
-                     max_num_batched_tokens=max_num_batched_tokens,
-                     max_num_seqs=max_num_seqs) as vllm_model:
-
-        vllm_sampling_params = SamplingParams(max_tokens=10,
-                                              logprobs=5,
-                                              prompt_logprobs=5,
-                                              temperature=0.0)
-        vllm_results = vllm_model.llm.generate(
-            example_prompts, sampling_params=vllm_sampling_params)
-
-        for idx, result in enumerate(vllm_results):
-            assert result.prompt_logprobs is not None
-            assert result.prompt_logprobs[0] is None
-
-            # Compared detokenized prompts ids to original prompt.
-            generated_string = ""
-            for (prompt_token,
-                 prompt_logprobs) in zip(result.prompt_token_ids[1:],
-                                         result.prompt_logprobs[1:]):
-                # prompt_logprobs is a dict of the token_id: logprob
-                # We select the token_id corresponding to the actual prompt
-                # Decoded token in the detokenized string corresponding to this
-                # prompt token.
-                generated_string += prompt_logprobs[prompt_token].decoded_token
-
-            assert generated_string == example_prompts[idx], (
-                "Detokenized prompt logprobs do not match original prompt")
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@ -12,7 +12,7 @@ from partial_json_parser.core.options import Allow
 from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall,
                                              ToolCall)
 from vllm.entrypoints.openai.tool_parsers import JambaToolParser
-from vllm.transformers_utils.detokenizer import detokenize_incrementally
+from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer

 MODEL = "ai21labs/Jamba-tiny-dev"
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              ToolCall)
 from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import (
    Qwen3CoderToolParser)
-from vllm.transformers_utils.detokenizer import detokenize_incrementally
+from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer

 MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage, FunctionCall,
                                              ToolCall)
 from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser
-from vllm.transformers_utils.detokenizer import detokenize_incrementally
+from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer

 # Use a common model that is likely to be available
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@ -11,7 +11,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage, FunctionCall,
                                              ToolCall)
 from vllm.entrypoints.openai.tool_parsers import xLAMToolParser
-from vllm.transformers_utils.detokenizer import detokenize_incrementally
+from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer

 # Use a common model that is likely to be available
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@ -1,15 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for v1 attention backends without GPUModelRunner dependency."""
+from functools import partial
+from typing import Optional, Union

 import pytest
 import torch
+from torch.nn.attention.flex_attention import create_block_mask, flex_attention

 from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata,
                                      create_standard_kv_cache_spec,
                                      create_vllm_config,
                                      get_attention_backend)
+from vllm.config import ModelConfig
+from vllm.platforms import current_platform
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv, is_torch_equal_or_newer
 from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
                                              set_kv_cache_layout)
@ -183,13 +188,19 @@ class MockAttentionLayer:
        self._v_scale_float = 1.0


-def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
-                          layer_names: list[str], vllm_config,
-                          device: torch.device,
-                          common_attn_metadata: CommonAttentionMetadata,
-                          query: torch.Tensor, key: torch.Tensor,
-                          value: torch.Tensor,
-                          kv_cache: torch.Tensor) -> torch.Tensor:
+def run_attention_backend(
+    backend: _Backend,
+    kv_cache_spec: FullAttentionSpec,
+    layer_names: list[str],
+    vllm_config,
+    device: torch.device,
+    common_attn_metadata: CommonAttentionMetadata,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    kv_cache: torch.Tensor,
+    sliding_window: Optional[int] = None,
+) -> torch.Tensor:
    """Run attention computation using the specified backend's AttentionImpl."""

    # Handle special case for FLEX_ATTENTION_SLOW
@ -253,7 +264,7 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
        scale=scale,
        num_kv_heads=num_kv_heads,
        alibi_slopes=None,
-        sliding_window=None,
+        sliding_window=sliding_window,
        kv_cache_dtype="auto",
    )

@ -275,13 +286,16 @@ def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
    return output


-@pytest.mark.parametrize("batch_spec_name", [
-    "small_decode", "small_prefill", "mixed_small", "medium_decode",
-    "medium_prefill", "mixed_medium", "large_decode", "large_prefill",
-    "single_decode", "single_prefill"
-])
-@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_backend_correctness(batch_spec_name: str, model: str):
+def _test_backend_correctness(
+    batch_spec: BatchSpec,
+    model: str,
+    backend_to_test: list[Union[_Backend, str]],
+    mask_mod,
+    *,
+    block_size: int = 16,
+    atol: float = 1e-2,
+    rtol: float = 1e-2,
+):
    """
    Test that all backends produce similar outputs to a reference implementation
    using torch.nn.functional.scaled_dot_product_attention.
@ -297,9 +311,10 @@ def test_backend_correctness(batch_spec_name: str, model: str):
       simulated paged KV cache.
    5. Comparing the vLLM backend's output to the ground-truth SDPA output.
    """
-    batch_spec = BATCH_SPECS[batch_spec_name]
+    current_platform.seed_everything(42)
    vllm_config = create_vllm_config(model_name=model,
                                     max_model_len=max(batch_spec.seq_lens),
+                                     block_size=block_size,
                                     num_gpu_blocks=8192)
    device = torch.device("cuda:0")

@ -314,6 +329,7 @@ def test_backend_correctness(batch_spec_name: str, model: str):
    num_kv_heads = vllm_config.model_config.get_num_kv_heads(
        vllm_config.parallel_config)
    head_size = vllm_config.model_config.get_head_size()
+    sliding_window = vllm_config.model_config.get_sliding_window()
    dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype)
    block_size = vllm_config.cache_config.block_size
    scale = 1.0 / (head_size**0.5)
@ -361,22 +377,21 @@ def test_backend_correctness(batch_spec_name: str, model: str):
        # Create causal mask: query token i attends to positions 0 to
        #  (context_len + i)
        kv_len = s_len
-        offset = context_len
-        attn_mask = torch.full((q_len, kv_len),
-                               float('-inf'),
-                               device=device,
-                               dtype=dtype)
-        for i in range(q_len):
-            attn_mask[i, :offset + i + 1] = 0.0

-        sdpa_out_i = torch.nn.functional.scaled_dot_product_attention(
-            q_sdpa_in,
-            k_sdpa_in,
-            v_sdpa_in,
-            attn_mask=attn_mask,
-            scale=scale,
-            enable_gqa=True)
-        # Convert back to (L, H, D)
+        final_mask_mod = partial(mask_mod, context_len=context_len)
+        block_mask = create_block_mask(final_mask_mod,
+                                       B=None,
+                                       H=None,
+                                       Q_LEN=q_len,
+                                       KV_LEN=kv_len,
+                                       device=device)
+        sdpa_out_i = flex_attention(q_sdpa_in,
+                                    k_sdpa_in,
+                                    v_sdpa_in,
+                                    block_mask=block_mask,
+                                    scale=scale,
+                                    enable_gqa=True)
+
        all_sdpa_outputs.append(sdpa_out_i.transpose(1, 2).squeeze(0))

        # Inputs for vLLM backends are just the new tokens
@ -412,7 +427,7 @@ def test_backend_correctness(batch_spec_name: str, model: str):
    # 4. Run vLLM backends and compare
    # Note: flex_attention has known Triton kernel compatibility issues
    # with test infrastructures
-    for backend_name in BACKENDS_TO_TEST:
+    for backend_name in backend_to_test:
        # FlashAttentionm + FlexAttention:
        #   [2, num_blocks, block_size, num_kv_heads, head_size]
        # FlashInfer:
@ -427,12 +442,19 @@ def test_backend_correctness(batch_spec_name: str, model: str):
                2, 3).contiguous().transpose(2, 3)
            set_kv_cache_layout("HND")

-        backend_output = run_attention_backend(backend_name, kv_cache_spec,
-                                               ["placeholder"], vllm_config,
-                                               device, common_attn_metadata,
-                                               query_vllm, key_vllm,
-                                               value_vllm,
-                                               kv_cache_for_backend)
+        backend_output = run_attention_backend(
+            backend_name,
+            kv_cache_spec,
+            ["placeholder"],
+            vllm_config,
+            device,
+            common_attn_metadata,
+            query_vllm,
+            key_vllm,
+            value_vllm,
+            kv_cache_for_backend,
+            sliding_window=sliding_window,
+        )

        # Check shape and dtype consistency
        assert backend_output.shape == sdpa_output.shape, (
@ -446,18 +468,102 @@ def test_backend_correctness(batch_spec_name: str, model: str):
            f"[{backend_name}] produced non-finite values")

        # Check numerical similarity
-        rtol = 1e-2
-        atol = 5e-3
+        def error_msg(msg: str, backend_name: str):
+            return (f"[{backend_name}] output differs from SDPA baseline. "
+                    f"{msg}")

-        max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item()
-        max_rel_diff = torch.max(
-            torch.abs(backend_output - sdpa_output) /
-            torch.abs(sdpa_output)).item()
-        all_close = torch.allclose(backend_output,
+        torch.testing.assert_close(backend_output,
                                   sdpa_output,
                                   rtol=rtol,
-                                   atol=atol)
+                                   atol=atol,
+                                   msg=partial(error_msg,
+                                               backend_name=backend_name))

-        assert all_close, (
-            f"[{backend_name}] output differs from SDPA baseline. "
-            f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})")
+
+@pytest.mark.parametrize("batch_spec_name", [
+    "small_decode", "small_prefill", "mixed_small", "medium_decode",
+    "medium_prefill", "mixed_medium", "large_decode", "large_prefill",
+    "single_decode", "single_prefill"
+])
+@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
+def test_causal_backend_correctness(batch_spec_name: str, model: str):
+    """Test backend's correctness with causal attention."""
+
+    def causal_mask_mod(
+        b: torch.Tensor,
+        h: torch.Tensor,
+        q_idx: torch.Tensor,
+        kv_idx: torch.Tensor,
+        *,
+        context_len: int,
+    ):
+        return (q_idx + context_len) >= kv_idx
+
+    batch_spec = BATCH_SPECS[batch_spec_name]
+    LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
+                            if is_torch_equal_or_newer("2.9.0.dev0") else [])
+    SMALL_BLOCK_BACKENDS = [
+        x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
+    ]
+    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
+                              causal_mask_mod)
+
+    # Fast FlexAttention needs to run with block_size=128
+    if LARGE_BLOCK_BACKENDS:
+        _test_backend_correctness(batch_spec,
+                                  model,
+                                  LARGE_BLOCK_BACKENDS,
+                                  causal_mask_mod,
+                                  block_size=128)
+
+
+SLIDING_WINDOW_BACKENDS_TO_TEST = [
+    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
+    _Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
+]
+
+
+@pytest.mark.parametrize("batch_spec_name", [
+    "small_decode", "small_prefill", "mixed_medium", "large_decode",
+    "large_prefill"
+])
+@pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"])
+def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
+    """Test backend's correctness with sliding window attention."""
+
+    def sliding_window_mask_mod(
+        b: torch.Tensor,
+        h: torch.Tensor,
+        q_idx: torch.Tensor,
+        kv_idx: torch.Tensor,
+        *,
+        context_len: int,
+        sliding_window: int,
+    ):
+        causal_mask = q_idx + context_len >= kv_idx
+        window_mask = q_idx + context_len - kv_idx < sliding_window
+        return causal_mask & window_mask
+
+    batch_spec = BATCH_SPECS[batch_spec_name]
+    model_config = ModelConfig(model=model,
+                               max_model_len=max(batch_spec.seq_lens))
+    sliding_window = model_config.get_sliding_window()
+    sliding_window_mask_mod_fn = partial(sliding_window_mask_mod,
+                                         sliding_window=sliding_window)
+
+    LARGE_BLOCK_BACKENDS = ([_Backend.FLEX_ATTENTION]
+                            if is_torch_equal_or_newer("2.9.0.dev0") else [])
+    SMALL_BLOCK_BACKENDS = [
+        x for x in SLIDING_WINDOW_BACKENDS_TO_TEST
+        if x not in LARGE_BLOCK_BACKENDS
+    ]
+    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS,
+                              sliding_window_mask_mod_fn)
+
+    # Fast FlexAttention needs to run with block_size=128
+    if LARGE_BLOCK_BACKENDS:
+        _test_backend_correctness(batch_spec,
+                                  model,
+                                  LARGE_BLOCK_BACKENDS,
+                                  sliding_window_mask_mod_fn,
+                                  block_size=128)
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@ -12,9 +12,9 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
                                   STOP_STRINGS,
                                   DummyOutputProcessorTestVectors,
                                   MockEngineCore)
+from vllm.logprobs import PromptLogprobs, SampleLogprobs
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
-from vllm.sequence import PromptLogprobs, SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.output_processor import (OutputProcessor,
--- a/tests/v1/engine/test_processor_multi_modal_uuids.py
+++ b/tests/v1/engine/test_processor_multi_modal_uuids.py
@ -6,7 +6,6 @@ import pytest
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
-from vllm.platforms.interface import UnspecifiedPlatform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import processor as processor_mod
 from vllm.v1.engine.processor import Processor
@ -33,15 +32,6 @@ def _mk_processor(monkeypatch,
                        "__post_init__",
                        lambda self, *args: None,
                        raising=True)
-    monkeypatch.setattr(UnspecifiedPlatform,
-                        "is_async_output_supported",
-                        classmethod(lambda cls, enforce_eager: True),
-                        raising=True)
-    monkeypatch.setattr(
-        ModelConfig,
-        "verify_async_output_proc",
-        lambda self, parallel_config, speculative_config, device_config: None,
-        raising=True)
    monkeypatch.setattr(ModelConfig,
                        "verify_with_parallel_config",
                        lambda self, parallel_config: None,
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@ -29,24 +29,6 @@ def test_unsupported_configs(monkeypatch):
                },
            ).create_engine_config()

-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                preemption_mode="swap",
-            ).create_engine_config()
-
-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                disable_async_output_proc=True,
-            ).create_engine_config()
-
-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                scheduler_delay_factor=1.2,
-            ).create_engine_config()
-

 def test_enable_by_default_fallback(monkeypatch):
    with monkeypatch.context() as m:
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@ -4,19 +4,14 @@
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, fields
-from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional,
-                    Protocol, Set, Tuple, Type, TypeVar)
+from typing import (Any, Dict, Generic, List, Optional, Protocol, Set, Tuple,
+                    Type, TypeVar)

 import torch

 from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
 from vllm.multimodal import MultiModalPlaceholderMap

-if TYPE_CHECKING:
-    from vllm.worker.model_runner_base import (ModelRunnerBase,
-                                               ModelRunnerInputBase,
-                                               ModelRunnerInputBuilderBase)
-

 class AttentionType:
    """
@ -170,7 +165,7 @@ class AttentionState(ABC, Generic[T]):
    lifetime of the model runner."""

    @abstractmethod
-    def __init__(self, runner: "ModelRunnerBase"):
+    def __init__(self, runner: Any):
        ...

    @abstractmethod
@ -210,7 +205,7 @@ class AttentionState(ABC, Generic[T]):
        ...

    @abstractmethod
-    def begin_forward(self, model_input: "ModelRunnerInputBase") -> None:
+    def begin_forward(self, model_input) -> None:
        """Prepare state for forward pass."""
        ...

@ -219,7 +214,7 @@ class AttentionMetadataBuilder(ABC, Generic[T]):
    """Abstract class for attention metadata builders."""

    @abstractmethod
-    def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None:
+    def __init__(self, input_builder) -> None:
        """Create the builder, remember some configuration and parameters."""
        raise NotImplementedError

--- a/vllm/attention/backends/differential_flash_attn.py
+++ b/vllm/attention/backends/differential_flash_attn.py
@ -4,7 +4,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from itertools import accumulate
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type

 import torch
 from einops import rearrange
@ -34,9 +34,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 from vllm.vllm_flash_attn import (flash_attn_varlen_func,
                                  flash_attn_with_kvcache)

-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-
 logger = init_logger(__name__)


@ -329,7 +326,7 @@ class DifferentialFlashAttentionMetadata(AttentionMetadata):
 class DifferentialFlashAttentionMetadataBuilder(
        AttentionMetadataBuilder[DifferentialFlashAttentionMetadata]):

-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner
        self.sliding_window = input_builder.sliding_window
@ -350,9 +347,8 @@ class DifferentialFlashAttentionMetadataBuilder(
        self.num_decode_tokens = 0
        self.has_prefix_cache_hit = False

-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
+                       prefix_cache_hit: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        2. block table.
--- a/vllm/attention/backends/dual_chunk_flash_attn.py
+++ b/vllm/attention/backends/dual_chunk_flash_attn.py
@ -4,7 +4,7 @@
 """
 import math
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type

 import torch
 import torch.distributed
@ -22,9 +22,6 @@ from vllm.utils import async_tensor_h2d
 from vllm.vllm_flash_attn import (flash_attn_varlen_func,
                                  flash_attn_with_kvcache, sparse_attn_func)

-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-
 logger = init_logger(__name__)


@ -224,9 +221,8 @@ class DualChunkFlashAttentionMetadataBuilder(FlashAttentionMetadataBuilder):
        super().prepare()
        self.orig_seq_lens: List[int] = []

-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
+                       prefix_cache_hit: bool):
        super()._add_seq_group(inter_data, chunked_prefill_enabled,
                               prefix_cache_hit)
        for prompt_len, seq_len in zip(inter_data.prompt_lens,
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@ -4,7 +4,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from itertools import accumulate
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type

 import torch

@ -31,9 +31,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 from vllm.vllm_flash_attn import (flash_attn_varlen_func,
                                  flash_attn_with_kvcache)

-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-
 logger = init_logger(__name__)


@ -312,7 +309,7 @@ class FlashAttentionMetadata(AttentionMetadata):
 class FlashAttentionMetadataBuilder(
        AttentionMetadataBuilder[FlashAttentionMetadata]):

-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner
        self.sliding_window = input_builder.sliding_window
@ -332,9 +329,8 @@ class FlashAttentionMetadataBuilder(
        self.num_decode_tokens = 0
        self.has_prefix_cache_hit = False

-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
+                       prefix_cache_hit: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        2. block table.
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@ -193,8 +193,7 @@ from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
 from itertools import accumulate
-from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
-                    Type, TypeVar)
+from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar

 import torch

@ -233,9 +232,6 @@ except ImportError:
    except ImportError:
        flash_attn_varlen_func = None

-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-
 is_hip = current_platform.is_rocm()


@ -638,7 +634,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
    """
    BLOCK_TABLE_EXTENDER: list[list[int]] = []

-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner
        self.sliding_window = input_builder.sliding_window
@ -668,9 +664,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
        self.num_decode_tokens = 0
        self.has_prefix_cache_hit = False

-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool, prefix_cache_hit: bool):
+    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool,
+                       prefix_cache_hit: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        2. block table.
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@ -4,7 +4,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from itertools import accumulate
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type

 import torch

@ -13,9 +13,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadataBuilder)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.multimodal import MultiModalPlaceholderMap
-
-if TYPE_CHECKING:
-    from vllm.worker.model_runner import (ModelInputForGPUBuilder)
 from vllm.utils import async_tensor_h2d

 # Placeholder attention backend for models like Mamba and pooling models that
@ -204,7 +201,7 @@ class PlaceholderAttentionMetadata(AttentionMetadata):
 class PlaceholderAttentionMetadataBuilder(
        AttentionMetadataBuilder[PlaceholderAttentionMetadata]):

-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+    def __init__(self, input_builder):

        self.input_builder = input_builder
        self.runner = input_builder.runner
@ -220,9 +217,7 @@ class PlaceholderAttentionMetadataBuilder(
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0

-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool):
+    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
        """Add a sequence group to the metadata. Specifically update/append
        1. context length.
        """
--- a/vllm/attention/backends/rocm_aiter_mla.py
+++ b/vllm/attention/backends/rocm_aiter_mla.py
@ -3,7 +3,7 @@

 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional, Type, Union
+from typing import Optional, Type, Union

 import torch

@ -19,9 +19,6 @@ from vllm.attention.backends.utils import (compute_slot_mapping,
 from vllm.attention.ops.rocm_aiter_mla import (aiter_mla_decode_fwd,
                                               get_aiter_mla_metadata)

-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-

 def is_aiter_mla_enabled() -> bool:
    return envs.VLLM_ROCM_USE_AITER \
@ -110,7 +107,7 @@ class AiterMLAMetadata(MLACommonMetadata):
 class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
    BLOCK_TABLE_EXTENDER: list[list[int]] = [[]]

-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+    def __init__(self, input_builder):
        super().__init__(input_builder)
        assert self.block_size == 1, "AITER MLA requires only block size 1."

--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@ -5,8 +5,7 @@ from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
 from itertools import accumulate
-from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
-                    TypeVar, Union)
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union

 import numpy as np
 import torch
@ -21,9 +20,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad

 logger = init_logger(__name__)

-if TYPE_CHECKING:
-    from vllm.worker.model_runner_base import ModelRunnerBase
-
 # Error string(s) for encoder/decoder
 # unsupported attention scenarios
 STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported "
@ -35,9 +31,6 @@ PAD_SLOT_ID = -1
 # if we have at least this many elements. Could be tuned further.
 _COMPUTE_SLOT_MAPPING_NUMPY_NUMEL = 256

-if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
-

 def is_block_tables_empty(block_tables: Union[None, Dict]):
    """
@ -129,7 +122,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):

    _metadata_cls: Type[TAttentionMetadata]

-    def __init__(self, input_builder: "ModelInputForGPUBuilder"):
+    def __init__(self, input_builder):
        self.input_builder = input_builder
        self.runner = input_builder.runner

@ -149,9 +142,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):
        self.num_prefill_tokens = 0
        self.num_decode_tokens = 0

-    def _add_seq_group(
-            self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup",
-            chunked_prefill_enabled: bool):
+    def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
        is_prompt = inter_data.is_prompt
        block_tables = inter_data.block_tables

@ -291,7 +282,7 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]):

 class CommonAttentionState(AttentionState):

-    def __init__(self, runner: "ModelRunnerBase"):
+    def __init__(self, runner):
        self.runner = runner
        self._is_graph_capturing = False

--- a/vllm/config/init.py
+++ b/vllm/config/init.py
@ -454,9 +454,6 @@ class VllmConfig:
        self.try_verify_and_update_config()

        if self.model_config is not None:
-            self.model_config.verify_async_output_proc(self.parallel_config,
-                                                       self.speculative_config,
-                                                       self.device_config)
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(
                self.load_config)
@ -877,7 +874,6 @@ class VllmConfig:
            f"served_model_name={self.model_config.served_model_name}, "
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
-            f"use_async_output_proc={self.model_config.use_async_output_proc}, "
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")

--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@ -27,8 +27,7 @@ from vllm.transformers_utils.config import (
    ConfigFormat, get_config, get_hf_image_processor_config,
    get_hf_text_config, get_pooling_config,
    get_sentence_transformer_tokenizer_config, is_encoder_decoder,
-    is_interleaved, maybe_override_with_speculators_target_model,
-    try_get_generation_config, try_get_safetensors_metadata,
+    is_interleaved, try_get_generation_config, try_get_safetensors_metadata,
    try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
                                                 is_runai_obj_uri)
@ -223,8 +222,6 @@ class ModelConfig:
    that this name(s) will also be used in `model_name` tag content of
    prometheus metrics, if multiple names provided, metrics tag will take the
    first one."""
-    use_async_output_proc: bool = True
-    """Whether to use async output processor."""
    config_format: Union[str, ConfigFormat] = "auto"
    """The format of the model config to load:\n
    - "auto" will try to load the config in hf format if available else it
@ -418,15 +415,6 @@ class ModelConfig:

        self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)

-        if self.runner != "draft":
-            # If we're not running the draft model, check for speculators config
-            # If speculators config, set model / tokenizer to be target model
-            self.model, self.tokenizer = maybe_override_with_speculators_target_model(  # noqa: E501
-                model=self.model,
-                tokenizer=self.tokenizer,
-                revision=self.revision,
-                trust_remote_code=self.trust_remote_code)
-
        if (backend := envs.VLLM_ATTENTION_BACKEND
            ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
            raise ValueError(
@ -1119,37 +1107,6 @@ class ModelConfig:
                raise ValueError("please set VLLM_ATTENTION_BACKEND to "
                                 f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")

-    def verify_async_output_proc(self, parallel_config, speculative_config,
-                                 device_config) -> None:
-        if not self.use_async_output_proc:
-            # Nothing to check
-            return
-
-        if parallel_config.pipeline_parallel_size > 1:
-            self.use_async_output_proc = False
-            return
-
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        from vllm.platforms import current_platform
-        if not current_platform.is_async_output_supported(self.enforce_eager):
-            self.use_async_output_proc = False
-            return
-
-        if envs.VLLM_USE_RAY_SPMD_WORKER:
-            self.use_async_output_proc = False
-            return
-
-        # Async postprocessor is not necessary for pooling models
-        # since there is no token generation
-        if self.runner_type == "pooling":
-            self.use_async_output_proc = False
-
-        # Reminder: Please update docs/features/compatibility_matrix.md
-        # If the feature combo become valid
-        if speculative_config:
-            self.use_async_output_proc = False
-
    def verify_with_parallel_config(
        self,
        parallel_config: ParallelConfig,
@ -1173,15 +1130,12 @@ class ModelConfig:
            self._verify_with_expert_parallelism()

        pipeline_parallel_size = parallel_config.pipeline_parallel_size
-        if pipeline_parallel_size > 1:
-            if not self.registry.is_pp_supported_model(self.architectures,
-                                                       self):
-                raise NotImplementedError(
-                    "Pipeline parallelism is not supported for this model. "
-                    "Supported models implement the `SupportsPP` interface.")
-
-            if self.use_async_output_proc:
-                self.use_async_output_proc = False
+        if (pipeline_parallel_size > 1
+                and not self.registry.is_pp_supported_model(
+                    self.architectures, self)):
+            raise NotImplementedError(
+                "Pipeline parallelism is not supported for this model. "
+                "Supported models implement the `SupportsPP` interface.")

    def get_sliding_window(self) -> Optional[int]:
        """Get the sliding window size from the HF text config if present."""
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@ -3,7 +3,7 @@

 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, Union

 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@ -18,7 +18,6 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
 logger = init_logger(__name__)

 RunnerType = Literal["generate", "pooling", "draft"]
-PreemptionMode = Literal["swap", "recompute"]
 SchedulerPolicy = Literal["fcfs", "priority"]


@ -78,10 +77,6 @@ class SchedulerConfig:
    3. more than one value (e.g. 1 2 128) is provided, then the capture list
    will follow the provided list."""

-    delay_factor: float = 0.0
-    """Apply a delay (of delay factor multiplied by previous
-    prompt latency) before scheduling next prompt."""
-
    enable_chunked_prefill: SkipValidation[bool] = None  # type: ignore
    """If True, prefill requests can be chunked based
    on the remaining max_num_batched_tokens."""
@ -103,14 +98,6 @@ class SchedulerConfig:
    NOTE: This is not currently configurable. It will be overridden by
    max_num_batched_tokens in case max multimodal embedding size is larger."""

-    preemption_mode: Optional[PreemptionMode] = None
-    """Whether to perform preemption by swapping or
-    recomputation. If not specified, we determine the mode as follows:
-    We use recomputation by default since it incurs lower overhead than
-    swapping. However, when the sequence group has multiple sequences
-    (e.g., beam search), recomputation is not currently supported. In
-    such a case, we use swapping instead."""
-
    send_delta_data: bool = False
    """Private API. If used, scheduler sends delta data to
    workers instead of an entire data. It should be enabled only
--- a/vllm/core/init.py
+++ b/vllm/core/init.py
--- a/vllm/core/block/init.py
+++ b/vllm/core/block/init.py
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -1,399 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import math
-from typing import List, Optional
-
-from vllm.core.block.common import BlockList
-from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
-from vllm.utils import Device, cdiv, chunk_list
-
-
-class BlockTable:
-    """A class to manage blocks for a specific sequence.
-
-    The BlockTable maps a sequence of tokens to a list of blocks, where each
-    block represents a contiguous memory allocation for a portion of the 
-    sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
-    responsible for allocating and freeing memory for the blocks.
-
-    Args:
-        block_size (int): The maximum number of tokens that can be stored in a
-            single block.
-        block_allocator (DeviceAwareBlockAllocator): The block allocator used to
-            manage memory for the blocks.
-        _blocks (Optional[List[Block]], optional): An optional list of existing
-            blocks to initialize the BlockTable with. If not provided, an empty
-            BlockTable is created.
-        max_block_sliding_window (Optional[int], optional): The number of
-            blocks to keep around for each sequence. If None, all blocks
-            are kept (eg., when sliding window is not used).
-            It should at least fit the sliding window size of the model.
-
-    Attributes:
-        _block_size (int): The maximum number of tokens that can be stored in a
-            single block.
-        _allocator (DeviceAwareBlockAllocator): The block allocator used to
-            manage memory for the blocks.
-        _blocks (Optional[List[Block]]): The list of blocks managed by this
-            BlockTable.
-        _num_full_slots (int): The number of tokens currently stored in the
-            blocks.
-    """
-
-    def __init__(
-        self,
-        block_size: int,
-        block_allocator: DeviceAwareBlockAllocator,
-        _blocks: Optional[List[Block]] = None,
-        max_block_sliding_window: Optional[int] = None,
-    ):
-        self._block_size = block_size
-        self._allocator = block_allocator
-        if _blocks is None:
-            _blocks = []
-        self._blocks: BlockList = BlockList(_blocks)
-
-        self._max_block_sliding_window = max_block_sliding_window
-        self._num_full_slots = self._get_num_token_ids()
-
-    @staticmethod
-    def get_num_required_blocks(token_ids: List[int],
-                                block_size: int,
-                                num_lookahead_slots: int = 0) -> int:
-        """Calculates the minimum number of blocks required to store a given
-        sequence of token IDs along with any look-ahead slots that may be
-        required (like in multi-step + chunked-prefill).
-
-        This assumes worst-case scenario, where every block requires a new
-        allocation (e.g. ignoring prefix caching).
-
-        Args:
-            token_ids (List[int]): The sequence of token IDs to be stored.
-            block_size (int): The maximum number of tokens that can be stored in
-                a single block.
-            num_lookahead_slots (int): look-ahead slots that the sequence may
-                require.
-
-        Returns:
-            int: The minimum number of blocks required to store the given
-                sequence of token IDs along with any required look-ahead slots.
-        """
-        return cdiv(len(token_ids) + num_lookahead_slots, block_size)
-
-    def allocate(self,
-                 token_ids: List[int],
-                 device: Device = Device.GPU,
-                 extra_hash: Optional[int] = None) -> None:
-        """Allocates memory blocks for storing the given sequence of token IDs.
-
-        This method allocates the required number of blocks to store the given
-        sequence of token IDs.
-
-        Args:
-            token_ids (List[int]): The sequence of token IDs to be stored.
-            device (Device, optional): The device on which the blocks should be
-                allocated. Defaults to Device.GPU.
-            extra_hash (Optional[int]): The hash value of additional
-                factors, such as adapters, that influence the block hash
-                in the prefixcaching block.
-        """
-        assert not self._is_allocated
-        assert token_ids
-        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
-                                                     token_ids=token_ids,
-                                                     device=device,
-                                                     extra_hash=extra_hash)
-        self.update(blocks)
-        self._num_full_slots = len(token_ids)
-
-    def update(self, blocks: List[Block]) -> None:
-        """Resets the table to the newly provided blocks 
-        (with their corresponding block ids)
-        """
-        self._blocks.update(blocks)
-
-    def append_token_ids(self,
-                         token_ids: List[int],
-                         num_lookahead_slots: int = 0,
-                         num_computed_slots: Optional[int] = None,
-                         extra_hash: Optional[int] = None) -> None:
-        """Appends a sequence of token IDs to the existing blocks in the
-        BlockTable.
-
-        This method appends the given sequence of token IDs to the existing
-        blocks in the BlockTable. If there is not enough space in the existing
-        blocks, new blocks are allocated using the `ensure_num_empty_slots`
-        method to accommodate the additional tokens.
-
-        The token IDs are divided into chunks of size `block_size` (except for
-        the first chunk, which may be smaller), and each chunk is appended to a
-        separate block.
-
-        Args:
-            token_ids (List[int]): The sequence of token IDs to be appended.
-            num_computed_slots (Optional[int]): The number of KV cache slots
-                that are already filled (computed).
-                When sliding window is enabled, this is used to compute how many
-                blocks to drop at the front of the sequence.
-                Without sliding window, None can be passed.
-                Without chunked prefill, it should be the same as
-                _num_full_slots.
-            extra_hash (Optional[int]): The hash value of additional
-                factors such as adapters that influence the block, apart
-                from the token_ids.
-        """
-        assert self._is_allocated, "no blocks have been allocated"
-        assert len(self._blocks) > 0
-
-        # Drop blocks that are no longer needed due to sliding window
-        if self._max_block_sliding_window is not None:
-            null_block = self._allocator.allocate_or_get_null_block()
-            assert num_computed_slots is not None
-            end_block_idx = (num_computed_slots //
-                             self._block_size) - self._max_block_sliding_window
-            for idx in range(0, end_block_idx):
-                b = self._blocks[idx]
-                if b is not null_block:
-                    self._allocator.free(b)
-                    self._blocks[idx] = null_block
-
-        # Ensure there are enough empty slots for the new tokens plus
-        # lookahead slots
-        self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
-                                    num_lookahead_slots,
-                                    extra_hash=extra_hash)
-
-        # Update the blocks with the new tokens
-        first_block_idx = self._num_full_slots // self._block_size
-        token_blocks = self._chunk_token_blocks_for_append(token_ids)
-
-        for i, token_block in enumerate(token_blocks):
-            self._blocks.append_token_ids(first_block_idx + i, token_block)
-
-        self._num_full_slots += len(token_ids)
-
-    def ensure_num_empty_slots(self,
-                               num_empty_slots: int,
-                               extra_hash: Optional[int] = None) -> None:
-        """Ensures that the BlockTable has at least the specified number of
-        empty slots available.
-
-        This method checks if the BlockTable has enough empty slots (i.e.,
-        available space) to accommodate the requested number of tokens. If not,
-        it allocates additional blocks on the GPU to ensure that the required
-        number of empty slots is available.
-
-        Args:
-            num_empty_slots (int): The minimum number of empty slots required.
-            extra_hash (Optional[int]): The hash value of additional
-                factors such as adapters that influence the block, apart
-                from the token_ids.
-        """
-        # Currently the block table only supports
-        # appending tokens to GPU blocks.
-        device = Device.GPU
-        assert self._is_allocated
-
-        if self._num_empty_slots >= num_empty_slots:
-            return
-
-        slots_to_allocate = num_empty_slots - self._num_empty_slots
-        blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)
-
-        for _ in range(blocks_to_allocate):
-            assert len(self._blocks) > 0
-            self._blocks.append(
-                self._allocator.allocate_mutable_block(
-                    prev_block=self._blocks[-1],
-                    device=device,
-                    extra_hash=extra_hash))
-
-    def fork(self) -> "BlockTable":
-        """Creates a new BlockTable instance with a copy of the blocks from the
-        current instance.
-
-        This method creates a new BlockTable instance with the same block size,
-        block allocator, and a copy of the blocks from the current instance. The
-        new BlockTable has its own independent set of blocks, but shares the
-        same underlying memory allocation with the original BlockTable.
-
-        Returns:
-            BlockTable: A new BlockTable instance with a copy of the blocks from
-                the current instance.
-        """
-        assert self._is_allocated
-        assert len(self._blocks) > 0
-        forked_blocks = self._allocator.fork(self._blocks[-1])
-        return BlockTable(
-            block_size=self._block_size,
-            block_allocator=self._allocator,
-            _blocks=forked_blocks,
-            max_block_sliding_window=self._max_block_sliding_window,
-        )
-
-    def free(self) -> None:
-        """Frees the memory occupied by the blocks in the BlockTable.
-
-        This method iterates over all the blocks in the `_blocks` list and calls
-        the `free` method of the `_allocator` object to release the memory
-        occupied by each block. After freeing all the blocks, the `_blocks` list
-        is set to `None`.
-        """
-        for block in self.blocks:
-            self._allocator.free(block)
-        self._blocks.reset()
-
-    @property
-    def physical_block_ids(self) -> List[int]:
-        """Returns a list of physical block indices for the blocks in the
-        BlockTable.
-
-        This property returns a list of integers, where each integer represents
-        the physical block index of a corresponding block in the `_blocks` list.
-        The physical block index is a unique identifier for the memory location
-        occupied by the block.
-
-        Returns:
-            List[int]: A list of physical block indices for the blocks in the
-                BlockTable.
-        """
-        return self._blocks.ids()
-
-    def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
-        """Get the number of "unseen" tokens in the sequence.
-
-        Unseen tokens are tokens in the sequence corresponding to this block
-        table, but are not yet appended to this block table.
-
-        Args:
-            sequence_token_ids (List[int]): The list of token ids in the
-                sequence.
-
-        Returns:
-            List[int]: The postfix of sequence_token_ids that has not yet been
-                appended to the block table.
-        """
-
-        # Since the block table is append-only, the unseen token ids are the
-        # ones after the appended ones.
-        return sequence_token_ids[self.num_full_slots:]
-
-    def _allocate_blocks_for_token_ids(
-            self,
-            prev_block: Optional[Block],
-            token_ids: List[int],
-            device: Device,
-            extra_hash: Optional[int] = None) -> List[Block]:
-        blocks: List[Block] = []
-
-        block_token_ids = []
-        tail_token_ids = []
-        for cur_token_ids in chunk_list(token_ids, self._block_size):
-            if len(cur_token_ids) == self._block_size:
-                block_token_ids.append(cur_token_ids)
-            else:
-                tail_token_ids.append(cur_token_ids)
-
-        if block_token_ids:
-            blocks.extend(
-                self._allocator.allocate_immutable_blocks(
-                    prev_block,
-                    block_token_ids=block_token_ids,
-                    device=device,
-                    extra_hash=extra_hash))
-            prev_block = blocks[-1]
-
-        if tail_token_ids:
-            assert len(tail_token_ids) == 1
-            cur_token_ids = tail_token_ids[0]
-
-            block = self._allocator.allocate_mutable_block(
-                prev_block=prev_block, device=device, extra_hash=extra_hash)
-            block.append_token_ids(cur_token_ids)
-
-            blocks.append(block)
-
-        return blocks
-
-    def _get_all_token_ids(self) -> List[int]:
-        # NOTE: This function is O(seq_len); use sparingly.
-        token_ids: List[int] = []
-
-        if not self._is_allocated:
-            return token_ids
-
-        for block in self.blocks:
-            token_ids.extend(block.token_ids)
-
-        return token_ids
-
-    def _get_num_token_ids(self) -> int:
-        res = 0
-        for block in self.blocks:
-            res += len(block.token_ids)
-
-        return res
-
-    @property
-    def _is_allocated(self) -> bool:
-        return len(self._blocks) > 0
-
-    @property
-    def blocks(self) -> List[Block]:
-        return self._blocks.list()
-
-    @property
-    def _num_empty_slots(self) -> int:
-        assert self._is_allocated
-        return len(self._blocks) * self._block_size - self._num_full_slots
-
-    @property
-    def num_full_slots(self) -> int:
-        """Returns the total number of tokens currently stored in the
-        BlockTable.
-
-        Returns:
-            int: The total number of tokens currently stored in the BlockTable.
-        """
-        return self._num_full_slots
-
-    def get_num_blocks_touched_by_append_slots(
-            self, token_ids: List[int], num_lookahead_slots: int) -> int:
-        """Determine how many blocks will be "touched" by appending the token
-        ids.
-
-        This is required for the scheduler to determine whether a sequence can
-        continue generation, or if it must be preempted.
-        """
-        # Math below is equivalent to:
-        # all_token_ids = token_ids + [-1] * num_lookahead_slots
-        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
-        # return len(token_blocks)
-
-        num_token_ids = len(token_ids) + num_lookahead_slots
-        first_chunk_size = self._block_size - (self._num_full_slots %
-                                               self._block_size)
-        num_token_blocks = (1 + math.ceil(
-            (num_token_ids - first_chunk_size) / self._block_size))
-        return num_token_blocks
-
-    def _chunk_token_blocks_for_append(
-            self, token_ids: List[int]) -> List[List[int]]:
-        """Split the token ids into block-sized chunks so they can be easily
-        appended to blocks. The first such "token block" may have less token ids
-        than the block size, since the last allocated block may be partially
-        full.
-
-        If no token ids are provided, then no chunks are returned.
-        """
-
-        if not token_ids:
-            return []
-
-        first_chunk_size = self._block_size - (self._num_full_slots %
-                                               self._block_size)
-        token_blocks = [token_ids[:first_chunk_size]]
-        token_blocks.extend(
-            chunk_list(token_ids[first_chunk_size:], self._block_size))
-        return token_blocks
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -1,371 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from collections import deque
-from dataclasses import dataclass
-from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
-
-from vllm.core.block.interfaces import Block, BlockAllocator
-
-BlockId = int
-RefCount = int
-
-
-class RefCounterProtocol(Protocol):
-
-    def incr(self, block_id: BlockId) -> RefCount:
-        raise NotImplementedError
-
-    def decr(self, block_id: BlockId) -> RefCount:
-        raise NotImplementedError
-
-    def get(self, block_id: BlockId) -> RefCount:
-        raise NotImplementedError
-
-
-class RefCounter(RefCounterProtocol):
-    """A class for managing reference counts for a set of block indices.
-
-    The RefCounter class maintains a dictionary that maps block indices to their
-    corresponding reference counts. It provides methods to increment, decrement,
-    and retrieve the reference count for a given block index.
-
-    Args:
-        all_block_indices (Iterable[BlockId]): An iterable of block indices
-            to initialize the reference counter with.
-    """
-
-    def __init__(self, all_block_indices: Iterable[BlockId]):
-        deduped = set(all_block_indices)
-        self._refcounts: Dict[BlockId, RefCount] = {
-            index: 0
-            for index in deduped
-        }
-
-    def incr(self, block_id: BlockId) -> RefCount:
-        assert block_id in self._refcounts
-        pre_incr_refcount = self._refcounts[block_id]
-
-        assert pre_incr_refcount >= 0
-
-        post_incr_refcount = pre_incr_refcount + 1
-        self._refcounts[block_id] = post_incr_refcount
-        return post_incr_refcount
-
-    def decr(self, block_id: BlockId) -> RefCount:
-        assert block_id in self._refcounts
-        refcount = self._refcounts[block_id]
-
-        assert refcount > 0
-        refcount -= 1
-
-        self._refcounts[block_id] = refcount
-
-        return refcount
-
-    def get(self, block_id: BlockId) -> RefCount:
-        assert block_id in self._refcounts
-        return self._refcounts[block_id]
-
-    def as_readonly(self) -> "ReadOnlyRefCounter":
-        return ReadOnlyRefCounter(self)
-
-
-class ReadOnlyRefCounter(RefCounterProtocol):
-    """A read-only view of the RefCounter class.
-
-    The ReadOnlyRefCounter class provides a read-only interface to access the
-    reference counts maintained by a RefCounter instance. It does not allow
-    modifications to the reference counts.
-
-    Args:
-        refcounter (RefCounter): The RefCounter instance to create a read-only
-            view for.
-    """
-
-    def __init__(self, refcounter: RefCounter):
-        self._refcounter = refcounter
-
-    def incr(self, block_id: BlockId) -> RefCount:
-        raise ValueError("Incr not allowed")
-
-    def decr(self, block_id: BlockId) -> RefCount:
-        raise ValueError("Decr not allowed")
-
-    def get(self, block_id: BlockId) -> RefCount:
-        return self._refcounter.get(block_id)
-
-
-class CopyOnWriteTracker:
-    """A class for tracking and managing copy-on-write operations for blocks.
-
-    The CopyOnWriteTracker class maintains a mapping of source block indices to
-        their corresponding copy-on-write destination block indices. It works in
-        conjunction with a RefCounter.
-
-    Args:
-        refcounter (RefCounter): The reference counter used to track block
-            reference counts.
-    """
-
-    def __init__(self, refcounter: RefCounterProtocol):
-        self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
-        self._refcounter = refcounter
-
-    def is_appendable(self, block: Block) -> bool:
-        """Checks if the block is shared or not. If shared, then it cannot
-        be appended and needs to be duplicated via copy-on-write
-        """
-        block_id = block.block_id
-        if block_id is None:
-            return True
-
-        refcount = self._refcounter.get(block_id)
-        return refcount <= 1
-
-    def record_cow(self, src_block_id: Optional[BlockId],
-                   trg_block_id: Optional[BlockId]) -> None:
-        """Records a copy-on-write operation from source to target block id
-        Args:
-            src_block_id (BlockId): The source block id from which to copy 
-                the data
-            trg_block_id (BlockId): The target block id to which the data
-                is copied
-        """
-        assert src_block_id is not None
-        assert trg_block_id is not None
-        self._copy_on_writes.append((src_block_id, trg_block_id))
-
-    def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
-        """Clears the copy-on-write tracking information and returns the current
-        state.
-
-        This method returns a list mapping source block indices to
-         destination block indices for the current copy-on-write operations.
-        It then clears the internal tracking information.
-
-        Returns:
-            List[Tuple[BlockId, BlockId]]: A list mapping source
-                block indices to destination block indices for the
-                current copy-on-write operations.
-        """
-        cows = self._copy_on_writes
-        self._copy_on_writes = []
-        return cows
-
-
-class BlockPool:
-    """Used to pre-allocate block objects, in order to avoid excessive python
-    object allocations/deallocations.
-    The pool starts from "pool_size" objects and will increase to more objects
-    if necessary
-
-    Note that multiple block objects may point to the same physical block id,
-    which is why this pool is needed, so that it will be easier to support
-    prefix caching and more complicated sharing of physical blocks.
-    """
-
-    def __init__(self, block_size: int, create_block: Block.Factory,
-                 allocator: BlockAllocator, pool_size: int):
-        self._block_size = block_size
-        self._create_block = create_block
-        self._allocator = allocator
-        self._pool_size = pool_size
-        assert self._pool_size >= 0
-
-        self._free_ids: Deque[int] = deque(range(self._pool_size))
-        self._pool = []
-        for i in range(self._pool_size):
-            self._pool.append(
-                self._create_block(prev_block=None,
-                                   token_ids=[],
-                                   block_size=self._block_size,
-                                   allocator=self._allocator,
-                                   block_id=None,
-                                   extra_hash=None))
-
-    def increase_pool(self):
-        """Doubles the internal pool size
-        """
-        cur_pool_size = self._pool_size
-        new_pool_size = cur_pool_size * 2
-        self._pool_size = new_pool_size
-
-        self._free_ids += deque(range(cur_pool_size, new_pool_size))
-
-        for i in range(cur_pool_size, new_pool_size):
-            self._pool.append(
-                self._create_block(prev_block=None,
-                                   token_ids=[],
-                                   block_size=self._block_size,
-                                   allocator=self._allocator,
-                                   block_id=None,
-                                   extra_hash=None))
-
-    def init_block(self,
-                   prev_block: Optional[Block],
-                   token_ids: List[int],
-                   block_size: int,
-                   physical_block_id: Optional[int],
-                   extra_hash: Optional[int] = None) -> Block:
-        if len(self._free_ids) == 0:
-            self.increase_pool()
-            assert len(self._free_ids) > 0
-
-        pool_id = self._free_ids.popleft()
-
-        block = self._pool[pool_id]
-        block.__init__(  # type: ignore[misc]
-            prev_block=prev_block,
-            token_ids=token_ids,
-            block_size=block_size,
-            allocator=block._allocator,  # type: ignore[attr-defined] 
-            block_id=physical_block_id,
-            extra_hash=extra_hash)
-        block.pool_id = pool_id  # type: ignore[attr-defined]
-        return block
-
-    def free_block(self, block: Block) -> None:
-        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
-
-
-class BlockList:
-    """This class is an optimization to allow fast-access to physical 
-    block ids. It maintains a block id list that is updated with the 
-    block list and this avoids the need to reconstruct the block id 
-    list on every iteration of the block manager
-    """
-
-    def __init__(self, blocks: List[Block]):
-        self._blocks: List[Block] = []
-        self._block_ids: List[int] = []
-
-        self.update(blocks)
-
-    def _add_block_id(self, block_id: Optional[BlockId]) -> None:
-        assert block_id is not None
-        self._block_ids.append(block_id)
-
-    def _update_block_id(self, block_index: int,
-                         new_block_id: Optional[BlockId]) -> None:
-        assert new_block_id is not None
-        self._block_ids[block_index] = new_block_id
-
-    def update(self, blocks: List[Block]):
-        self._blocks = blocks
-
-        # Cache block ids for fast query
-        self._block_ids = []
-        for block in self._blocks:
-            self._add_block_id(block.block_id)
-
-    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
-        block = self._blocks[block_index]
-        prev_block_id = block.block_id
-
-        block.append_token_ids(token_ids)
-
-        # CoW or promotion may update the internal block_id
-        if prev_block_id != block.block_id:
-            self._update_block_id(block_index, block.block_id)
-
-    def append(self, new_block: Block):
-        self._blocks.append(new_block)
-        self._add_block_id(new_block.block_id)
-
-    def __len__(self) -> int:
-        return len(self._blocks)
-
-    def __getitem__(self, block_index: int) -> Block:
-        return self._blocks[block_index]
-
-    def __setitem__(self, block_index: int, new_block: Block) -> None:
-        self._blocks[block_index] = new_block
-        self._update_block_id(block_index, new_block.block_id)
-
-    def reset(self):
-        self._blocks = []
-        self._block_ids = []
-
-    def list(self) -> List[Block]:
-        return self._blocks
-
-    def ids(self) -> List[int]:
-        return self._block_ids
-
-
-@dataclass
-class CacheMetricData:
-    """A utility dataclass to maintain cache metric.
-    To avoid overflow, we maintain the hit rate in block granularity, so that
-    we can maintain a single hit rate for n_completed_block x block_size,
-    and calculate the real time hit rate by the following:
-    BS = The number of queries per block.
-    nB = The number of completed blocks.
-    HR = hit rate of (nB x BS) queries.
-    Q = current number of queries (< BS).
-    H = current number of hits (< BS).
-    hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
-    """
-    num_completed_blocks: int = 0
-    completed_block_cache_hit_rate: float = 0.0
-    num_incompleted_block_queries: int = 0
-    num_incompleted_block_hit: int = 0
-    block_size: int = 1000
-
-    def query(self, hit: bool):
-        self.num_incompleted_block_queries += 1
-        self.num_incompleted_block_hit += 1 if hit else 0
-
-        # When a block is completed, update the cache hit rate
-        # and reset the incomplete numbers.
-        if self.num_incompleted_block_queries == self.block_size:
-            hit_rate = (self.num_incompleted_block_hit /
-                        self.num_incompleted_block_queries)
-            self.completed_block_cache_hit_rate = (
-                self.completed_block_cache_hit_rate * self.num_completed_blocks
-                + hit_rate) / (self.num_completed_blocks + 1)
-            self.num_incompleted_block_queries = 0
-            self.num_incompleted_block_hit = 0
-            self.num_completed_blocks += 1
-
-    def get_hit_rate(self):
-        incomplete_ratio = self.num_incompleted_block_queries / self.block_size
-        total_blocks = self.num_completed_blocks + incomplete_ratio
-        if total_blocks == 0:
-            return 0.0
-
-        completed_block_hit, incompleted_block_hit = 0.0, 0.0
-        if self.num_completed_blocks > 0:
-            completed_block_hit = (self.completed_block_cache_hit_rate *
-                                   self.num_completed_blocks)
-        if self.num_incompleted_block_queries > 0:
-            incompleted_hit_rate = (self.num_incompleted_block_hit /
-                                    self.num_incompleted_block_queries)
-            incompleted_block_hit = (incompleted_hit_rate * incomplete_ratio)
-        return (completed_block_hit + incompleted_block_hit) / total_blocks
-
-
-def get_all_blocks_recursively(last_block: Block) -> List[Block]:
-    """Retrieves all the blocks in a sequence starting from the last block.
-
-    This function recursively traverses the sequence of blocks in reverse order,
-    starting from the given last block, and returns a list of all the blocks in
-    the sequence.
-
-    Args:
-        last_block (Block): The last block in the sequence.
-
-    Returns:
-        List[Block]: A list of all the blocks in the sequence, in the order they
-            appear.
-    """
-
-    def recurse(block: Block, lst: List[Block]) -> None:
-        if block.prev_block is not None:
-            recurse(block.prev_block, lst)
-        lst.append(block)
-
-    all_blocks: List[Block] = []
-    recurse(last_block, all_blocks)
-    return all_blocks
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@ -1,439 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Dict, FrozenSet, List, Optional, Tuple
-
-from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
-                                        DeviceAwareBlockAllocator)
-from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
-from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
-from vllm.utils import Device
-
-
-class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
-    """A block allocator that can allocate blocks on both CPU and GPU memory.
-
-    This class implements the `DeviceAwareBlockAllocator` interface and provides
-    functionality for allocating and managing blocks of memory on both CPU and
-    GPU devices.
-
-    The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
-    blocks, and allows for allocation, deallocation, forking, and swapping of
-    blocks across these memory pools.
-    """
-
-    @staticmethod
-    def create(
-        allocator_type: str,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-        block_size: int,
-    ) -> DeviceAwareBlockAllocator:
-        """Creates a CpuGpuBlockAllocator instance with the specified
-        configuration.
-
-        This static method creates and returns a CpuGpuBlockAllocator instance
-        based on the provided parameters. It initializes the CPU and GPU block
-        allocators with the specified number of blocks, block size, and
-        allocator type.
-
-        Args:
-            allocator_type (str): The type of block allocator to use for CPU
-                and GPU blocks. Currently supported values are "naive" and
-                "prefix_caching".
-            num_gpu_blocks (int): The number of blocks to allocate for GPU
-                memory.
-            num_cpu_blocks (int): The number of blocks to allocate for CPU
-                memory.
-            block_size (int): The size of each block in number of tokens.
-
-        Returns:
-            DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
-                specified configuration.
-
-        Notes:
-            - The block IDs are assigned contiguously, with GPU block IDs coming
-                before CPU block IDs.
-        """
-        reserved_blocks = 0
-        block_ids = list(
-            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
-        num_gpu_blocks -= reserved_blocks
-        gpu_block_ids = block_ids[:num_gpu_blocks]
-        cpu_block_ids = block_ids[num_gpu_blocks:]
-
-        if allocator_type == "naive":
-            gpu_allocator: BlockAllocator = NaiveBlockAllocator(
-                create_block=NaiveBlock,  # type: ignore
-                num_blocks=num_gpu_blocks,
-                block_size=block_size,
-                block_ids=gpu_block_ids,
-            )
-
-            cpu_allocator: BlockAllocator = NaiveBlockAllocator(
-                create_block=NaiveBlock,  # type: ignore
-                num_blocks=num_cpu_blocks,
-                block_size=block_size,
-                block_ids=cpu_block_ids,
-            )
-        elif allocator_type == "prefix_caching":
-            gpu_allocator = PrefixCachingBlockAllocator(
-                num_blocks=num_gpu_blocks,
-                block_size=block_size,
-                block_ids=gpu_block_ids,
-            )
-
-            cpu_allocator = PrefixCachingBlockAllocator(
-                num_blocks=num_cpu_blocks,
-                block_size=block_size,
-                block_ids=cpu_block_ids,
-            )
-        else:
-            raise ValueError(f"Unknown allocator type {allocator_type=}")
-
-        return CpuGpuBlockAllocator(
-            cpu_block_allocator=cpu_allocator,
-            gpu_block_allocator=gpu_allocator,
-        )
-
-    def __init__(self, cpu_block_allocator: BlockAllocator,
-                 gpu_block_allocator: BlockAllocator):
-        assert not (
-            cpu_block_allocator.all_block_ids
-            & gpu_block_allocator.all_block_ids
-        ), "cpu and gpu block allocators can't have intersection of block ids"
-
-        self._allocators = {
-            Device.CPU: cpu_block_allocator,
-            Device.GPU: gpu_block_allocator,
-        }
-
-        self._swap_mapping: Dict[int, int] = {}
-        self._null_block: Optional[Block] = None
-
-        self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
-        for _, allocator in self._allocators.items():
-            for block_id in allocator.all_block_ids:
-                self._block_ids_to_allocator[block_id] = allocator
-
-    def allocate_or_get_null_block(self) -> Block:
-        if self._null_block is None:
-            self._null_block = NullBlock(
-                self.allocate_mutable_block(None, Device.GPU))
-        return self._null_block
-
-    def allocate_mutable_block(self,
-                               prev_block: Optional[Block],
-                               device: Device,
-                               extra_hash: Optional[int] = None) -> Block:
-        """Allocates a new mutable block on the specified device.
-
-        Args:
-            prev_block (Optional[Block]): The previous block to in the sequence.
-                Used for prefix hashing.
-            device (Device): The device on which to allocate the new block.
-            extra_hash (Optional[int]): The hash value of additional
-                factors, such as adapters, that influence the block hash
-                in the prefix caching block.
-
-        Returns:
-            Block: The newly allocated mutable block.
-        """
-        return self._allocators[device].allocate_mutable_block(
-            prev_block, extra_hash=extra_hash)
-
-    def allocate_immutable_blocks(
-            self,
-            prev_block: Optional[Block],
-            block_token_ids: List[List[int]],
-            device: Device,
-            extra_hash: Optional[int] = None) -> List[Block]:
-        """Allocates a new group of immutable blocks with the provided block 
-        token IDs on the specified device.
-
-        Args:
-            prev_block (Optional[Block]): The previous block in the sequence.
-                Used for prefix hashing.
-            block_token_ids (List[int]): The list of block token IDs to be 
-                stored in the new blocks.
-            device (Device): The device on which to allocate the new block.
-            extra_hash (Optional[int]): The hash value of additional
-                factors, such as adapters, that influence the block hash
-                in the prefix caching block.
-
-        Returns:
-            List[Block]: The newly allocated list of immutable blocks 
-                containing the provided block token IDs.
-        """
-        return self._allocators[device].allocate_immutable_blocks(
-            prev_block, block_token_ids, extra_hash=extra_hash)
-
-    def allocate_immutable_block(self,
-                                 prev_block: Optional[Block],
-                                 token_ids: List[int],
-                                 device: Device,
-                                 extra_hash: Optional[int] = None) -> Block:
-        """Allocates a new immutable block with the provided token IDs on the
-        specified device.
-
-        Args:
-            prev_block (Optional[Block]): The previous block in the sequence.
-                Used for prefix hashing.
-            token_ids (List[int]): The list of token IDs to be stored in the new
-                block.
-            device (Device): The device on which to allocate the new block.
-            extra_hash (Optional[int]): The hash value of additional
-                factors, such as adapters, that influence the block hash
-                in the prefix caching block.
-
-        Returns:
-            Block: The newly allocated immutable block containing the provided
-                token IDs.
-        """
-        return self._allocators[device].allocate_immutable_block(
-            prev_block, token_ids, extra_hash=extra_hash)
-
-    def free(self, block: Block) -> None:
-        """Frees the memory occupied by the given block.
-
-        Args:
-            block (Block): The block to be freed.
-        """
-        # Null block should never be freed
-        if isinstance(block, NullBlock):
-            return
-        block_id = block.block_id
-        assert block_id is not None
-        allocator = self._block_ids_to_allocator[block_id]
-        allocator.free(block)
-
-    def fork(self, last_block: Block) -> List[Block]:
-        """Creates a new sequence of blocks that shares the same underlying
-            memory as the original sequence.
-
-        Args:
-            last_block (Block): The last block in the original sequence.
-
-        Returns:
-            List[Block]: A new list of blocks that shares the same memory as the
-                original sequence.
-        """
-        # do not attempt to fork the null block
-        assert not isinstance(last_block, NullBlock)
-        block_id = last_block.block_id
-        assert block_id is not None
-        allocator = self._block_ids_to_allocator[block_id]
-        return allocator.fork(last_block)
-
-    def get_num_free_blocks(self, device: Device) -> int:
-        """Returns the number of free blocks available on the specified device.
-
-        Args:
-            device (Device): The device for which to query the number of free
-                blocks. AssertionError is raised if None is passed.
-
-        Returns:
-            int: The number of free blocks available on the specified device.
-        """
-        return self._allocators[device].get_num_free_blocks()
-
-    def get_num_total_blocks(self, device: Device) -> int:
-        return self._allocators[device].get_num_total_blocks()
-
-    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
-        """Returns the zero-offset block id on certain device given the 
-        absolute block id.
-
-        Args:
-            device (Device): The device for which to query relative block id.
-                absolute_id (int): The absolute block id for the block in 
-                whole allocator.
-
-        Returns:
-            int: The zero-offset block id on certain device.
-        """
-        return self._allocators[device].get_physical_block_id(absolute_id)
-
-    def swap(self, blocks: List[Block], src_device: Device,
-             dst_device: Device) -> Dict[int, int]:
-        """Execute the swap for the given blocks from source_device
-        on to dest_device, save the current swap mapping and append 
-        them to the accumulated `self._swap_mapping` for each 
-        scheduling move.
-
-        Args:
-            blocks: List of blocks to be swapped.
-            src_device (Device): Device to swap the 'blocks' from.
-            dst_device (Device): Device to swap the 'blocks' to.
-        
-        Returns:
-            Dict[int, int]: Swap mapping from source_device
-                on to dest_device.
-        """
-        src_block_ids = [block.block_id for block in blocks]
-        self._allocators[src_device].swap_out(blocks)
-        self._allocators[dst_device].swap_in(blocks)
-        dst_block_ids = [block.block_id for block in blocks]
-
-        current_swap_mapping: Dict[int, int] = {}
-        for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
-            if src_block_id is not None and dst_block_id is not None:
-                self._swap_mapping[src_block_id] = dst_block_id
-                current_swap_mapping[src_block_id] = dst_block_id
-        return current_swap_mapping
-
-    def get_num_full_blocks_touched(self, blocks: List[Block],
-                                    device: Device) -> int:
-        """Returns the number of full blocks that will be touched by
-        swapping in/out the given blocks on to the 'device'.
-
-        Args:
-            blocks: List of blocks to be swapped.
-            device (Device): Device to swap the 'blocks' on.
-
-        Returns:
-            int: the number of full blocks that will be touched by
-                swapping in/out the given blocks on to the 'device'.
-                Non full blocks are ignored when deciding the number
-                of blocks to touch.
-        """
-        return self._allocators[device].get_num_full_blocks_touched(blocks)
-
-    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
-        """Clears the copy-on-write (CoW) state and returns the mapping of
-            source to destination block IDs.
-
-        Returns:
-            List[Tuple[int, int]]: A list mapping source block IDs to 
-                destination block IDs.
-        """
-        # CoW only supported on GPU
-        device = Device.GPU
-        return self._allocators[device].clear_copy_on_writes()
-
-    def mark_blocks_as_accessed(self, block_ids: List[int],
-                                now: float) -> None:
-        """Mark blocks as accessed, only use for prefix caching."""
-        # Prefix caching only supported on GPU.
-        device = Device.GPU
-        return self._allocators[device].mark_blocks_as_accessed(block_ids, now)
-
-    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        """Mark blocks as accessed, only use for prefix caching."""
-        # Prefix caching only supported on GPU.
-        device = Device.GPU
-        return self._allocators[device].mark_blocks_as_computed(block_ids)
-
-    def get_common_computed_block_ids(
-            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
-        # Prefix caching only supported on GPU.
-        device = Device.GPU
-        return self._allocators[device].get_common_computed_block_ids(
-            computed_seq_block_ids)
-
-    @property
-    def all_block_ids(self) -> FrozenSet[int]:
-        return frozenset(self._block_ids_to_allocator.keys())
-
-    def get_prefix_cache_hit_rate(self, device: Device) -> float:
-        """Prefix cache hit rate. -1 means not supported or disabled."""
-        assert device in self._allocators
-        return self._allocators[device].get_prefix_cache_hit_rate()
-
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        """Reset prefix cache for specified or all devices."""
-        if device:
-            return self._allocators[device].reset_prefix_cache()
-        success = True
-        for allocator in self._allocators.values():
-            success = success and allocator.reset_prefix_cache()
-        return success
-
-    def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
-        """Returns and clears the mapping of source to destination block IDs.
-        Will be called after every swapping operations for now, and after every
-        schedule when BlockManagerV2 become default. Currently not useful.
-
-        Returns:
-            List[Tuple[int, int]]: A mapping of source to destination block IDs.
-        """
-        mapping = self._swap_mapping.copy()
-        self._swap_mapping.clear()
-        return list(mapping.items())
-
-    def find_cached_blocks_prefix(
-        self,
-        block_hashes: List[int],
-        device: Device = Device.GPU,
-    ) -> List[int]:
-        return self._allocators[device].find_cached_blocks_prefix(block_hashes)
-
-
-class NullBlock(Block):
-    """
-    Null blocks are used as a placeholders for KV cache blocks that have
-    been dropped due to sliding window.
-    This implementation just wraps an ordinary block and prevents it from
-    being modified. It also allows for testing if a block is NullBlock
-    via isinstance().
-    """
-
-    def __init__(self, proxy: Block):
-        super().__init__()
-        self._proxy = proxy
-
-    def append_token_ids(self, token_ids: List[BlockId]):
-        raise ValueError("null block should not be modified")
-
-    @property
-    def block_id(self):
-        return self._proxy.block_id
-
-    @block_id.setter
-    def block_id(self, value: Optional[BlockId]):
-        raise ValueError("null block should not be modified")
-
-    @property
-    def token_ids(self) -> List[BlockId]:
-        return self._proxy.token_ids
-
-    @property
-    def num_tokens_total(self) -> int:
-        raise NotImplementedError(
-            "num_tokens_total is not used for null block")
-
-    @property
-    def num_empty_slots(self) -> BlockId:
-        return self._proxy.num_empty_slots
-
-    @property
-    def is_full(self):
-        return self._proxy.is_full
-
-    @property
-    def prev_block(self):
-        return self._proxy.prev_block
-
-    @property
-    def extra_hash(self):
-        return None
-
-    @property
-    def computed(self):
-        return self._proxy.computed
-
-    @computed.setter
-    def computed(self, value):
-        self._proxy.computed = value
-
-    @property
-    def last_accessed(self) -> float:
-        return self._proxy.last_accessed
-
-    @last_accessed.setter
-    def last_accessed(self, last_accessed_ts: float):
-        self._proxy.last_accessed = last_accessed_ts
-
-    @property
-    def content_hash(self):
-        return self._proxy.content_hash
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@ -1,319 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from abc import ABC, abstractmethod
-from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
-
-from vllm.utils import Device
-
-BlockId = int
-
-
-class Block(ABC):
-
-    @abstractmethod
-    def append_token_ids(self, token_ids: List[int]) -> None:
-        pass
-
-    @property
-    @abstractmethod
-    def block_id(self) -> Optional[int]:
-        pass
-
-    @block_id.setter
-    @abstractmethod
-    def block_id(self, value: Optional[int]) -> None:
-        """NOTE: Do not use this API outside Block."""
-        self._block_id = value
-
-    @property
-    @abstractmethod
-    def token_ids(self) -> List[int]:
-        pass
-
-    @property
-    @abstractmethod
-    def num_tokens_total(self) -> int:
-        """The number of tokens till the current block (inclusive)
-        """
-        pass
-
-    @property
-    @abstractmethod
-    def num_empty_slots(self) -> int:
-        pass
-
-    @property
-    @abstractmethod
-    def is_full(self) -> bool:
-        pass
-
-    @property
-    @abstractmethod
-    def prev_block(self) -> Optional["Block"]:
-        pass
-
-    @property
-    @abstractmethod
-    def extra_hash(self) -> Optional[int]:
-        return None
-
-    @property
-    @abstractmethod
-    def computed(self) -> bool:
-        raise NotImplementedError
-
-    @computed.setter
-    @abstractmethod
-    def computed(self, value) -> bool:
-        """Should be only used by PrefixCacingAllocator"""
-        raise NotImplementedError
-
-    @property
-    @abstractmethod
-    def last_accessed(self) -> float:
-        raise NotImplementedError
-
-    @last_accessed.setter
-    @abstractmethod
-    def last_accessed(self, last_accessed_ts: float):
-        raise NotImplementedError
-
-    class Factory(Protocol):
-
-        @abstractmethod
-        def __call__(
-            self,
-            prev_block: Optional["Block"],
-            token_ids: List[int],
-            block_size: int,
-            allocator: "BlockAllocator",
-            block_id: Optional[int] = None,
-            computed: bool = False,
-            extra_hash: Optional[int] = None,
-        ) -> "Block":
-            pass
-
-    @property
-    @abstractmethod
-    def content_hash(self) -> Optional[int]:
-        """Return the content-based hash of the current block, or None if it is
-        not yet defined or not supported.
-
-        For the content-based hash to be defined, the current block must be
-        full.
-        """
-        return None
-
-
-class BlockAllocator(ABC):
-
-    @abstractmethod
-    def allocate_mutable_block(self, prev_block: Optional[Block],
-                               extra_hash: Optional[int]) -> Block:
-        pass
-
-    @abstractmethod
-    def allocate_immutable_block(self, prev_block: Optional[Block],
-                                 token_ids: List[int],
-                                 extra_hash: Optional[int]) -> Block:
-        pass
-
-    @abstractmethod
-    def allocate_immutable_blocks(self, prev_block: Optional[Block],
-                                  block_token_ids: List[List[int]],
-                                  extra_hash: Optional[int]) -> List[Block]:
-        pass
-
-    @abstractmethod
-    def free(self, block: Block) -> None:
-        pass
-
-    @abstractmethod
-    def fork(self, last_block: Block) -> List[Block]:
-        pass
-
-    @abstractmethod
-    def get_num_total_blocks(self) -> int:
-        pass
-
-    @abstractmethod
-    def get_num_free_blocks(self) -> int:
-        pass
-
-    @abstractmethod
-    def get_physical_block_id(self, absolute_id: int) -> int:
-        pass
-
-    @abstractmethod
-    def swap_out(self, blocks: List[Block]) -> None:
-        pass
-
-    @abstractmethod
-    def swap_in(self, blocks: List[Block]) -> None:
-        pass
-
-    @property
-    @abstractmethod
-    def all_block_ids(self) -> FrozenSet[int]:
-        pass
-
-    @abstractmethod
-    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
-        pass
-
-    @abstractmethod
-    def mark_blocks_as_accessed(self, block_ids: List[int],
-                                now: float) -> None:
-        pass
-
-    @abstractmethod
-    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        pass
-
-    @abstractmethod
-    def get_common_computed_block_ids(
-            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
-        pass
-
-    @abstractmethod
-    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
-        """NOTE: This should not be used besides Block"""
-        pass
-
-    @abstractmethod
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        """NOTE: This should not be used besides Block"""
-        pass
-
-    @abstractmethod
-    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
-        pass
-
-    @abstractmethod
-    def get_prefix_cache_hit_rate(self) -> float:
-        """Prefix cache hit rate. -1 means not supported or disabled."""
-        pass
-
-    @abstractmethod
-    def reset_prefix_cache(self) -> bool:
-        """Reset prefix cache."""
-        pass
-
-    class NoFreeBlocksError(ValueError):
-        pass
-
-    @abstractmethod
-    def find_cached_blocks_prefix(
-        self,
-        block_hashes: List[int],
-    ) -> List[int]:
-        pass
-
-
-class DeviceAwareBlockAllocator(ABC):
-
-    @abstractmethod
-    def allocate_mutable_block(self,
-                               prev_block: Optional[Block],
-                               device: Device,
-                               extra_hash: Optional[int] = None) -> Block:
-        pass
-
-    @abstractmethod
-    def allocate_immutable_block(self,
-                                 prev_block: Optional[Block],
-                                 token_ids: List[int],
-                                 device: Device,
-                                 extra_hash: Optional[int] = None) -> Block:
-        pass
-
-    @abstractmethod
-    def allocate_immutable_blocks(
-        self,
-        prev_block: Optional[Block],
-        block_token_ids: List[List[int]],
-        device: Device,
-        extra_hash: Optional[int] = None,
-    ) -> List[Block]:
-        pass
-
-    @abstractmethod
-    def get_num_free_blocks(self, device: Device) -> int:
-        pass
-
-    @abstractmethod
-    def get_num_total_blocks(self, device: Device) -> int:
-        pass
-
-    @abstractmethod
-    def free(self, block: Block) -> None:
-        pass
-
-    @abstractmethod
-    def fork(self, last_block: Block) -> List[Block]:
-        pass
-
-    @property
-    @abstractmethod
-    def all_block_ids(self) -> FrozenSet[int]:
-        pass
-
-    @abstractmethod
-    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
-        pass
-
-    @abstractmethod
-    def mark_blocks_as_accessed(self, block_ids: List[int],
-                                now: float) -> None:
-        pass
-
-    @abstractmethod
-    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        pass
-
-    @abstractmethod
-    def get_common_computed_block_ids(
-            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
-        pass
-
-    @abstractmethod
-    def get_num_full_blocks_touched(self, blocks: List[Block],
-                                    device: Device) -> int:
-        pass
-
-    @abstractmethod
-    def swap(self, blocks: List[Block], src_device: Device,
-             dst_device: Device) -> Dict[int, int]:
-        pass
-
-    @abstractmethod
-    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
-        pass
-
-    @abstractmethod
-    def allocate_or_get_null_block(self) -> Block:
-        """
-        Null blocks are used as a placeholders for KV cache blocks that have
-        been dropped due to sliding window.
-        There is at most one null block per allocator.
-        """
-        pass
-
-    @abstractmethod
-    def get_prefix_cache_hit_rate(self, device: Device) -> float:
-        """Prefix cache hit rate. -1 means not supported or disabled."""
-        pass
-
-    @abstractmethod
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        """Reset prefix cache."""
-        pass
-
-    @abstractmethod
-    def find_cached_blocks_prefix(
-        self,
-        block_hashes: List[int],
-        device: Device = Device.GPU,
-    ) -> List[int]:
-        pass
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@ -1,466 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from collections import deque
-from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
-
-from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
-                                    get_all_blocks_recursively)
-from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
-
-Refcount = int
-
-
-class NaiveBlockAllocator(BlockAllocator):
-    """A simple block allocator that manages blocks of memory without prefix
-    caching.
-
-    Args:
-        create_block (Block.Factory): A factory function for creating new
-            blocks. This is used when a NaiveBlockAllocator is composed within
-            a prefix caching allocator -- the naive block allocator must
-            construct prefix caching blocks (but shouldn't know anything else
-            about them).
-        num_blocks (int): The total number of blocks to manage.
-        block_size (int): The size of each block in tokens.
-        block_ids (Optional[Iterable[int]], optional): An optional iterable of
-            block IDs. If not provided, block IDs will be assigned sequentially
-            from 0 to num_blocks - 1. The iterable is consumed exactly once,
-            so single-pass iterables such as generators are accepted.
-        block_pool (Optional[BlockPool], optional): A pool of block objects to
-            use instead of creating a private one.
-    """
-
-    def __init__(
-        self,
-        create_block: Block.Factory,
-        num_blocks: int,
-        block_size: int,
-        block_ids: Optional[Iterable[int]] = None,
-        block_pool: Optional[BlockPool] = None,
-    ):
-        if block_ids is None:
-            block_ids = range(num_blocks)
-
-        # Materialize once: ``block_ids`` feeds both the free list and the
-        # set of all ids, and a single-pass iterable would otherwise be
-        # exhausted by the first consumer.
-        block_ids = tuple(block_ids)
-
-        self._free_block_indices: Deque[BlockId] = deque(block_ids)
-        self._all_block_indices = frozenset(block_ids)
-        assert len(self._all_block_indices) == num_blocks
-
-        self._refcounter = RefCounter(
-            all_block_indices=self._free_block_indices)
-        self._block_size = block_size
-
-        self._cow_tracker = CopyOnWriteTracker(
-            refcounter=self._refcounter.as_readonly())
-
-        if block_pool is None:
-            extra_factor = 4
-            # Pre-allocate "num_blocks * extra_factor" block objects.
-            # The "* extra_factor" is a buffer to allow more block objects
-            # than physical blocks
-            self._block_pool = BlockPool(self._block_size, create_block, self,
-                                         num_blocks * extra_factor)
-        else:
-            # In this case, the block pool is provided by the caller,
-            # which means that there is most likely a need to share
-            # a block pool between allocators
-            self._block_pool = block_pool
-
-    def allocate_immutable_block(self,
-                                 prev_block: Optional[Block],
-                                 token_ids: List[int],
-                                 extra_hash: Optional[int] = None,
-                                 device: Optional[Device] = None) -> Block:
-        """Allocates a new immutable block with the given token IDs, linked to
-        the previous block.
-
-        Args:
-            prev_block (Optional[Block]): The previous block in the sequence. If
-                None, then the block to be allocated is the first block in the
-                sequence.
-            token_ids (List[int]): The token IDs to be stored in the new block.
-            extra_hash (Optional[int]): Accepted but not used by this method.
-            device (Optional[Device]): Must be None.
-
-        Returns:
-            Block: The newly allocated immutable block.
-        """
-        assert device is None
-        block = self.allocate_mutable_block(prev_block=prev_block)
-        block.append_token_ids(token_ids)
-        return block
-
-    def allocate_immutable_blocks(
-            self,
-            prev_block: Optional[Block],
-            block_token_ids: List[List[int]],
-            extra_hash: Optional[int] = None,
-            device: Optional[Device] = None) -> List[Block]:
-        """Allocates one block per entry of ``block_token_ids``, chained after
-        ``prev_block``.
-
-        Args:
-            prev_block (Optional[Block]): The block preceding the first new
-                block, or None.
-            block_token_ids (List[List[int]]): Token IDs for each new block,
-                in chain order.
-            extra_hash (Optional[int]): Accepted but not used by this method.
-            device (Optional[Device]): Must be None.
-
-        Returns:
-            List[Block]: The newly allocated blocks, in chain order.
-
-        Raises:
-            BlockAllocator.NoFreeBlocksError: If there are not enough free
-                blocks. Block ids reserved before the failure are returned
-                to the free list.
-        """
-        assert device is None
-
-        # Reserve every physical id before touching the block pool. If the
-        # free list runs dry part-way through, give back what was taken so
-        # a failed call does not leak block ids.
-        block_ids: List[BlockId] = []
-        try:
-            for _ in block_token_ids:
-                block_ids.append(self._allocate_block_id())
-        except BlockAllocator.NoFreeBlocksError:
-            # Ids were taken with popleft() and are returned with
-            # appendleft(), so releasing in reverse restores the order.
-            for block_id in reversed(block_ids):
-                self._free_block_id(block_id)
-            raise
-
-        blocks = []
-        for block_id, token_ids in zip(block_ids, block_token_ids):
-            prev_block = self._block_pool.init_block(
-                prev_block=prev_block,
-                token_ids=token_ids,
-                block_size=self._block_size,
-                physical_block_id=block_id)
-            blocks.append(prev_block)
-
-        return blocks
-
-    def allocate_mutable_block(self,
-                               prev_block: Optional[Block],
-                               extra_hash: Optional[int] = None,
-                               device: Optional[Device] = None) -> Block:
-        """Allocates a new mutable block, linked to the previous block.
-
-        Args:
-            prev_block (Optional[Block]): The previous block in the sequence. If
-                None, then the block to be allocated is the first block in the
-                sequence.
-            extra_hash (Optional[int]): Accepted but not used by this method.
-            device (Optional[Device]): Must be None.
-
-        Returns:
-            Block: The newly allocated mutable block.
-        """
-        assert device is None
-        block_id = self._allocate_block_id()
-        block = self._block_pool.init_block(prev_block=prev_block,
-                                            token_ids=[],
-                                            block_size=self._block_size,
-                                            physical_block_id=block_id)
-        return block
-
-    def _allocate_block_id(self) -> BlockId:
-        """Takes the next id from the free list and bumps its refcount.
-
-        Raises:
-            BlockAllocator.NoFreeBlocksError: If the free list is empty.
-        """
-        if not self._free_block_indices:
-            raise BlockAllocator.NoFreeBlocksError()
-
-        block_id = self._free_block_indices.popleft()
-        self._refcounter.incr(block_id)
-        return block_id
-
-    def _free_block_id(self, block: Union[Block, BlockId]) -> None:
-        """Drops one reference to a block id, returning the id to the free
-        list when the refcount reaches zero.
-
-        When a Block object is passed, its ``block_id`` is also reset to None.
-        """
-        if isinstance(block, Block):
-            block_id = block.block_id
-            block.block_id = None
-        else:
-            block_id = block
-        assert block_id is not None
-
-        refcount = self._refcounter.decr(block_id)
-        if refcount == 0:
-            self._free_block_indices.appendleft(block_id)
-
-    def free(self, block: Block, keep_block_object: bool = False) -> None:
-        """Releases the block's physical id and, unless
-        ``keep_block_object`` is set, returns the block object to the pool.
-        """
-        # Release the physical block id
-        self._free_block_id(block)
-
-        # Release the block object
-        if not keep_block_object:
-            self._block_pool.free_block(block)
-
-    def free_block_id(self, block_id: BlockId) -> None:
-        """Drops one reference to ``block_id`` without a block object."""
-        self._free_block_id(block_id)
-
-    def fork(self, last_block: Block) -> List[Block]:
-        """Creates a new sequence of blocks that shares the same underlying
-        memory as the original sequence.
-
-        Args:
-            last_block (Block): The last block in the original sequence.
-
-        Returns:
-            List[Block]: The new sequence of blocks that shares the same memory
-                as the original sequence.
-        """
-        source_blocks = get_all_blocks_recursively(last_block)
-
-        forked_blocks: List[Block] = []
-        prev_block = None
-        for block in source_blocks:
-
-            # Increment refcount for each block.
-            assert block.block_id is not None
-            refcount = self._refcounter.incr(block.block_id)
-            assert refcount != 1, "can't fork freed block"
-
-            forked_block = self._block_pool.init_block(
-                prev_block=prev_block,
-                token_ids=block.token_ids,
-                block_size=self._block_size,
-                physical_block_id=block.block_id)
-
-            forked_blocks.append(forked_block)
-            prev_block = forked_block
-
-        return forked_blocks
-
-    def get_num_free_blocks(self) -> int:
-        """Returns the number of ids currently on the free list."""
-        return len(self._free_block_indices)
-
-    def get_num_total_blocks(self) -> int:
-        """Returns the number of block ids managed by this allocator."""
-        return len(self._all_block_indices)
-
-    def get_physical_block_id(self, absolute_id: int) -> int:
-        """Returns the zero-offset block id on certain block allocator
-        given the absolute block id.
-
-        Args:
-            absolute_id (int): The absolute block id for the block
-                in whole allocator.
-
-        Returns:
-            int: The zero-offset block id on certain device.
-
-        Raises:
-            ValueError: If ``absolute_id`` is not managed by this allocator.
-        """
-        # The offset is the rank of the id among all ids of this allocator.
-        return sorted(self._all_block_indices).index(absolute_id)
-
-    @property
-    def refcounter(self):
-        """The allocator's reference counter."""
-        return self._refcounter
-
-    @property
-    def all_block_ids(self) -> FrozenSet[int]:
-        """All block ids managed by this allocator."""
-        return self._all_block_indices
-
-    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
-        """Performs a copy-on-write operation on the given block if it is not
-        appendable.
-
-        Args:
-            block (Block): The block to check for copy-on-write.
-
-        Returns:
-            BlockId: The block index of the new block if a copy-on-write
-                operation was performed, or the original block index if
-                no copy-on-write was necessary.
-        """
-        src_block_id = block.block_id
-        assert src_block_id is not None
-
-        if self._cow_tracker.is_appendable(block):
-            return src_block_id
-
-        # Drop this block's reference to the source id, then take a fresh id
-        # as the copy target.
-        self._free_block_id(block)
-        trg_block_id = self._allocate_block_id()
-
-        self._cow_tracker.record_cow(src_block_id, trg_block_id)
-
-        return trg_block_id
-
-    def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
-        """Returns the copy-on-write source->destination mapping and clears it.
-
-        Returns:
-            List[Tuple[BlockId, BlockId]]: A list mapping source
-                block indices to destination block indices.
-        """
-        return self._cow_tracker.clear_cows()
-
-    def mark_blocks_as_accessed(self, block_ids: List[int],
-                                now: float) -> None:
-        """Mark blocks as accessed, used in prefix caching.
-
-        Since the naive allocator does not implement prefix caching, we do
-        nothing.
-        """
-        pass
-
-    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
-        """Mark blocks as computed, used in prefix caching.
-
-        Since the naive allocator does not implement prefix caching, we do
-        nothing.
-        """
-        pass
-
-    def get_common_computed_block_ids(
-            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
-        """Determine blocks that can be skipped in prefill.
-
-        Since the naive allocator does not support prefix caching, always return
-        an empty list.
-        """
-        return []
-
-    def promote_to_immutable_block(self, block: Block) -> BlockId:
-        """Not supported by the naive allocator.
-
-        Raises:
-            NotImplementedError: Always.
-        """
-        raise NotImplementedError("There is no promotion for naive blocks")
-
-    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
-        """Returns the number of full blocks that will be touched by
-        swapping in/out.
-
-        Args:
-            blocks: List of blocks to be swapped.
-        Returns:
-            int: the number of full blocks that will be touched by
-                swapping in/out the given blocks. Non full blocks are ignored
-                when deciding the number of blocks to touch.
-        """
-        # NOTE: a set is used so that a block object appearing more than once
-        # in ``blocks`` is only counted once.
-        return len({block for block in blocks if block.is_full})
-
-    def swap_out(self, blocks: List[Block]) -> None:
-        """Releases the physical id of every block in ``blocks``; the block
-        objects themselves are kept.
-        """
-        for block in blocks:
-            self._free_block_id(block)
-
-    def swap_in(self, blocks: List[Block]) -> None:
-        """Assigns a freshly allocated physical id to every block in
-        ``blocks``, reusing the existing block objects.
-        """
-        for block in blocks:
-            # Here we allocate either immutable or mutable block and then
-            # extract its block_id. Note that the block object is released
-            # and the block_id is assigned to "block" to allow reusing the
-            # existing "block" object
-            if block.is_full:
-                tmp_block = self.allocate_immutable_block(
-                    prev_block=block.prev_block, token_ids=block.token_ids)
-            else:
-                tmp_block = self.allocate_mutable_block(
-                    prev_block=block.prev_block)
-                tmp_block.append_token_ids(block.token_ids)
-
-            block_id = tmp_block.block_id
-            # Detach the id first so releasing the temporary object does not
-            # carry it back to the pool.
-            tmp_block.block_id = None
-            self._block_pool.free_block(tmp_block)
-
-            block.block_id = block_id  # Assign block_id
-
-    def get_prefix_cache_hit_rate(self) -> float:
-        """Always -1: the naive allocator has no prefix cache."""
-        return -1
-
-    def reset_prefix_cache(self) -> bool:
-        """No prefix cache for naive block allocator."""
-        return True
-
-    def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
-        """Always returns an empty list."""
-        # Not applicable for naive block allocator.
-        return []
-
-
-class NaiveBlock(Block):
-    """A fixed-capacity block of token IDs without prefix-caching support.
-
-    Token IDs can be appended until the block is full; appending to a block
-    that already has a physical id asks the allocator whether a
-    copy-on-write is needed.
-
-    Args:
-        prev_block (Block): The previous block in the sequence.
-        token_ids (List[int]): The initial token IDs to be stored in the block.
-        block_size (int): The maximum number of token IDs that can be stored in
-            the block.
-        allocator (BlockAllocator): The block allocator associated with this
-            block.
-        block_id (Optional[int], optional): The physical block index
-            of this block. Defaults to None, which means no allocation has been
-            made.
-        _cow_target (Optional[Block], optional): The copy-on-write target block.
-            If not provided, it defaults to self.
-        extra_hash (Optional[int], optional): Accepted but not stored.
-    """
-
-    def __init__(self,
-                 prev_block: Optional[Block],
-                 token_ids: List[int],
-                 block_size: int,
-                 allocator: BlockAllocator,
-                 block_id: Optional[int] = None,
-                 _cow_target: Optional[Block] = None,
-                 extra_hash: Optional[int] = None):
-        self._allocator = allocator
-        self._prev_block = prev_block
-        self._block_id = block_id
-        self._block_size = block_size
-        self._cow_target = self if _cow_target is None else _cow_target
-
-        # Start empty, then load the initial tokens through the same
-        # capacity check used for later appends.
-        self._token_ids: List[int] = []
-        self._append_token_ids_no_cow(token_ids)
-
-    def append_token_ids(self, token_ids: List[int]) -> None:
-        """Appends the given token IDs to the block and performs a
-        copy-on-write if necessary.
-
-        Args:
-            token_ids (List[int]): The token IDs to be appended
-                to the block.
-        """
-        self._append_token_ids_no_cow(token_ids)
-
-        # A block without a physical id has nothing to copy.
-        if self._block_id is None:
-            return
-
-        new_block_id = self._allocator.cow_block_if_not_appendable(
-            self._cow_target)
-        self._block_id = new_block_id
-
-    def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
-        """Appends the given token IDs to the block
-
-        Args:
-            token_ids (List[int]): The token IDs to be appended to the block.
-        """
-        if len(token_ids) > 0:
-            assert len(token_ids) <= self.num_empty_slots
-            self._token_ids.extend(token_ids)
-
-    @property
-    def computed(self) -> bool:
-        raise NotImplementedError
-
-    @computed.setter
-    def computed(self, value) -> None:
-        raise NotImplementedError
-
-    @property
-    def last_accessed(self) -> float:
-        raise NotImplementedError
-
-    @last_accessed.setter
-    def last_accessed(self, last_accessed_ts: float):
-        raise NotImplementedError
-
-    @property
-    def block_id(self) -> Optional[int]:
-        """The physical block index, or None when unallocated."""
-        return self._block_id
-
-    @block_id.setter
-    def block_id(self, value: Optional[int]) -> None:
-        self._block_id = value
-
-    @property
-    def is_full(self) -> bool:
-        """True when no empty slots remain."""
-        return not self.num_empty_slots
-
-    @property
-    def num_empty_slots(self) -> int:
-        """Block capacity minus the number of stored token IDs."""
-        used_slots = len(self.token_ids)
-        return self._block_size - used_slots
-
-    @property
-    def token_ids(self) -> List[int]:
-        """The token IDs currently stored in the block."""
-        return self._token_ids
-
-    @property
-    def num_tokens_total(self) -> int:
-        raise NotImplementedError(
-            "num_tokens_total is not used for naive block")
-
-    @property
-    def block_size(self) -> int:
-        """The maximum number of token IDs the block can hold."""
-        return self._block_size
-
-    @property
-    def prev_block(self) -> Optional["Block"]:
-        """The previous block in the sequence, if any."""
-        return self._prev_block
-
-    @property
-    def extra_hash(self):
-        """Always None for naive blocks."""
-        return None
-
-    @property
-    def content_hash(self) -> Optional[int]:
-        """Always None for naive blocks."""
-        return None
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
--- a/vllm/core/block/utils.py
+++ b/vllm/core/block/utils.py
@ -1,28 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Block manager utils."""
-from vllm.sequence import SequenceGroup
-from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
-                        STR_NOT_IMPL_ENC_DEC_SWA)
-
-
-def check_no_caching_or_swa_for_blockmgr_encdec(
-        block_mgr, seq_group: SequenceGroup) -> None:
-    '''
-    Reject prefix caching and sliding-window attention (SWA) for
-    encoder/decoder sequence groups; other sequence groups pass unchecked.
-
-    Arguments:
-
-    * block_mgr: BlockSpaceManager instance; its `max_block_sliding_window`
-      and `enable_caching` attributes are inspected
-    * seq_group: SequenceGroup passed to block_mgr
-
-    Raises NotImplementedError if the group is encoder/decoder and either
-    feature is active. The sliding window is checked first.
-    '''
-
-    # Only encoder/decoder groups are restricted.
-    if not seq_group.is_encoder_decoder():
-        return
-
-    if block_mgr.max_block_sliding_window is not None:
-        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)
-
-    if block_mgr.enable_caching:
-        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@ -1,523 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""A block manager that manages token blocks."""
-from typing import Dict, List, Optional
-from typing import Sequence as GenericSequence
-from typing import Tuple
-
-from vllm.core.block.block_table import BlockTable
-from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
-from vllm.core.block.interfaces import Block
-from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
-                                                  LastAccessBlocksTracker)
-from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
-from vllm.core.interfaces import AllocStatus, BlockSpaceManager
-from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
-from vllm.utils import Device
-
-SeqId = int
-EncoderSeqId = str
-
-
-class SelfAttnBlockSpaceManager(BlockSpaceManager):
-    """BlockSpaceManager which manages the allocation of KV cache.
-
-    It owns responsibility for allocation, swapping, allocating memory for
-    autoregressively-generated tokens, and other advanced features such as
-    prefix caching, forking/copy-on-write, and sliding-window memory allocation.
-
-    This class implements the design described in
-    https://github.com/vllm-project/vllm/pull/3492.
-
-    Lookahead slots
-        The block manager has the notion of a "lookahead slot". These are slots
-        in the KV cache that are allocated for a sequence. Unlike the other
-        allocated slots, the content of these slots is undefined -- the worker
-        may use the memory allocations in any way.
-
-        In practice, a worker could use these lookahead slots to run multiple
-        forward passes for a single scheduler invocation. Each successive
-        forward pass would write KV activations to the corresponding lookahead
-        slot. This allows low inter-token latency use-cases, where the overhead
-        of continuous batching scheduling is amortized over >1 generated tokens.
-
-        Speculative decoding uses lookahead slots to store KV activations of
-        proposal tokens.
-
-        See https://github.com/vllm-project/vllm/pull/3250 for more information
-        on lookahead scheduling.
-
-    Args:
-        block_size (int): The size of each memory block.
-        num_gpu_blocks (int): The number of memory blocks allocated on GPU.
-        num_cpu_blocks (int): The number of memory blocks allocated on CPU.
-        watermark (float, optional): The threshold used for memory swapping.
-            Defaults to 0.01.
-        sliding_window (Optional[int], optional): The size of the sliding
-            window. Defaults to None.
-        enable_caching (bool, optional): Flag indicating whether caching is
-            enabled. Defaults to False.
-    """
-
-    def __init__(
-        self,
-        block_size: int,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-        watermark: float = 0.01,
-        sliding_window: Optional[int] = None,
-        enable_caching: bool = False,
-    ) -> None:
-        """Store the configuration and build the allocator, the block-table
-        maps and the two block trackers."""
-        self.block_size = block_size
-        self.num_total_gpu_blocks = num_gpu_blocks
-        self.num_total_cpu_blocks = num_cpu_blocks
-
-        self.sliding_window = sliding_window
-        # max_block_sliding_window is the max number of blocks that need to be
-        # allocated
-        if sliding_window is None:
-            self.max_block_sliding_window = None
-        else:
-            # Two blocks are added on top of the floor division:
-            #  * one because // rounds down;
-            #  * one because the last block may not be full, so the window
-            #    stretches one more block at the beginning. For example, if
-            #    sliding_window is 3 and block_size is 4, we may need 2
-            #    blocks when the second block only holds 1 token.
-            self.max_block_sliding_window = sliding_window // block_size + 2
-
-        self.watermark = watermark
-        assert watermark >= 0.0
-
-        self.enable_caching = enable_caching
-
-        self.watermark_blocks = int(watermark * num_gpu_blocks)
-
-        if enable_caching:
-            allocator_type = "prefix_caching"
-        else:
-            allocator_type = "naive"
-        self.block_allocator = CpuGpuBlockAllocator.create(
-            allocator_type=allocator_type,
-            num_gpu_blocks=num_gpu_blocks,
-            num_cpu_blocks=num_cpu_blocks,
-            block_size=block_size,
-        )
-
-        # Decoder block tables are keyed by sequence id, cross-attention
-        # block tables by request id.
-        self.block_tables: Dict[SeqId, BlockTable] = {}
-        self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {}
-
-        self._computed_blocks_tracker = ComputedBlocksTracker(
-            self.block_allocator, self.block_size, self.enable_caching)
-        self._last_access_blocks_tracker = LastAccessBlocksTracker(
-            self.block_allocator)
-
-    def can_allocate(self,
-                     seq_group: SequenceGroup,
-                     num_lookahead_slots: int = 0) -> AllocStatus:
-        """Report whether the group's prompt fits in the GPU KV cache.
-
-        Returns ``NEVER`` when the required blocks cannot fit even with every
-        GPU block free, ``OK`` when they fit now with the watermark kept
-        free, and ``LATER`` otherwise.
-        """
-        # FIXME(woosuk): Here we assume that all sequences in the group share
-        # the same prompt. This may not be true for preempted sequences.
-
-        check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group)
-
-        first_waiting = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
-        blocks_needed = BlockTable.get_num_required_blocks(
-            first_waiting.get_token_ids(),
-            block_size=self.block_size,
-            num_lookahead_slots=num_lookahead_slots,
-        )
-
-        if seq_group.is_encoder_decoder():
-            encoder_seq = seq_group.get_encoder_seq()
-            assert encoder_seq is not None
-            blocks_needed += BlockTable.get_num_required_blocks(
-                encoder_seq.get_token_ids(),
-                block_size=self.block_size,
-            )
-
-        # With a sliding window the requirement is capped.
-        window_cap = self.max_block_sliding_window
-        if window_cap is not None:
-            blocks_needed = min(blocks_needed, window_cap)
-
-        free_gpu_blocks = self.block_allocator.get_num_free_blocks(
-            device=Device.GPU)
-
-        # Use watermark to avoid frequent cache eviction.
-        if (self.num_total_gpu_blocks - blocks_needed
-                < self.watermark_blocks):
-            return AllocStatus.NEVER
-        if free_gpu_blocks - blocks_needed >= self.watermark_blocks:
-            return AllocStatus.OK
-        return AllocStatus.LATER
-
-    def _allocate_sequence(self, seq: Sequence) -> BlockTable:
-        """Build a block table for ``seq``; blocks are allocated only when
-        the sequence has tokens."""
-        table = BlockTable(
-            block_size=self.block_size,
-            block_allocator=self.block_allocator,
-            max_block_sliding_window=self.max_block_sliding_window,
-        )
-
-        # An empty sequence gets an empty table.
-        if not seq.get_token_ids():
-            return table
-
-        # NOTE: If there are any factors affecting the block besides
-        # token_ids, they should be added as input to extra_hash.
-        extra_hash = seq.extra_hash()
-        table.allocate(token_ids=seq.get_token_ids(), extra_hash=extra_hash)
-        return table
-
-    def allocate(self, seq_group: SequenceGroup) -> None:
-        """Create block tables for every waiting sequence of the group and,
-        for encoder/decoder groups, a cross-attention block table."""
-
-        # Allocate self-attention block tables for decoder sequences
-        waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
-        waiting_ids = {waiting.seq_id for waiting in waiting_seqs}
-        assert self.block_tables.keys().isdisjoint(
-            waiting_ids), "block table already exists"
-
-        # NOTE: Here we assume that all sequences in the group have the same
-        # prompt.
-        first_seq = waiting_seqs[0]
-        first_table: BlockTable = self._allocate_sequence(first_seq)
-        self.block_tables[first_seq.seq_id] = first_table
-
-        # Track seq
-        self._last_access_blocks_tracker.add_seq(first_seq.seq_id)
-
-        # Every remaining sequence gets a fork of the first block table.
-        for sibling in waiting_seqs[1:]:
-            self.block_tables[sibling.seq_id] = first_table.fork()
-
-            # Track seq
-            self._last_access_blocks_tracker.add_seq(sibling.seq_id)
-
-        # Allocate cross-attention block table for encoder sequence
-        #
-        # NOTE: Here we assume that all sequences in the group have the same
-        # encoder prompt.
-        request_id = seq_group.request_id
-
-        assert request_id not in self.cross_block_tables, \
-            "block table already exists"
-
-        check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group)
-
-        if not seq_group.is_encoder_decoder():
-            return
-
-        encoder_seq = seq_group.get_encoder_seq()
-        assert encoder_seq is not None
-        self.cross_block_tables[request_id] = self._allocate_sequence(
-            encoder_seq)
-
-    def can_append_slots(self, seq_group: SequenceGroup,
-                         num_lookahead_slots: int) -> bool:
-        """Determine if there is enough space in the GPU KV cache to continue
-        generation of the specified sequence group.
-
-        A worst-case heuristic is used: every touched block is assumed to
-        need a new allocation (either via CoW or a new block). Slots can be
-        appended if the number of touched blocks does not exceed the number
-        of free GPU blocks.
-
-        "Lookahead slots" are slots that are allocated in addition to the slots
-        for known tokens. The contents of the lookahead slots are not defined.
-        This is used by speculative decoding when speculating future tokens.
-        """
-
-        touched_total = 0
-        for running in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            table = self.block_tables[running.seq_id]
-            unseen_ids = table.get_unseen_token_ids(running.get_token_ids())
-            touched_total += table.get_num_blocks_touched_by_append_slots(
-                token_ids=unseen_ids,
-                num_lookahead_slots=num_lookahead_slots,
-            )
-
-        free_gpu_blocks = self.block_allocator.get_num_free_blocks(
-            Device.GPU)
-        return touched_total <= free_gpu_blocks
-
-    def append_slots(
-        self,
-        seq: Sequence,
-        num_lookahead_slots: int,
-    ) -> List[Tuple[int, int]]:
-        """Append the sequence's not-yet-seen tokens (plus lookahead slots)
-        to its block table and return what the allocator's
-        ``clear_copy_on_writes()`` reports afterwards."""
-
-        table = self.block_tables[seq.seq_id]
-
-        table.append_token_ids(
-            token_ids=table.get_unseen_token_ids(seq.get_token_ids()),
-            num_lookahead_slots=num_lookahead_slots,
-            num_computed_slots=seq.data.get_num_computed_tokens(),
-            extra_hash=seq.extra_hash(),
-        )
-        # Return any new copy-on-writes.
-        return self.block_allocator.clear_copy_on_writes()
-
-    def free(self, seq: Sequence) -> None:
-        """Release the block table of ``seq`` and stop tracking it; a no-op
-        when the sequence has no block table."""
-        seq_id = seq.seq_id
-
-        if seq_id not in self.block_tables:
-            # Already freed or haven't been scheduled yet.
-            return
-
-        table = self.block_tables[seq_id]
-
-        # Update seq block ids with the latest access time
-        self._last_access_blocks_tracker.update_seq_blocks_last_access(
-            seq_id, table.physical_block_ids)
-
-        # Untrack seq
-        self._last_access_blocks_tracker.remove_seq(seq_id)
-        self._computed_blocks_tracker.remove_seq(seq_id)
-
-        # Free table/blocks
-        table.free()
-        del self.block_tables[seq_id]
-
-    def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
-        """Drop ``seq`` from the computed-blocks tracker."""
-        self._computed_blocks_tracker.remove_seq(seq.seq_id)
-
-    def free_cross(self, seq_group: SequenceGroup) -> None:
-        """Release the cross-attention block table of the group's request;
-        a no-op when there is none."""
-        request_id = seq_group.request_id
-        if request_id in self.cross_block_tables:
-            self.cross_block_tables[request_id].free()
-            del self.cross_block_tables[request_id]
-        # Otherwise: already freed or hasn't been scheduled yet.
-
-    def get_block_table(self, seq: Sequence) -> List[int]:
-        """Return the physical block ids of the block table of ``seq``."""
-        table = self.block_tables[seq.seq_id]
-        return table.physical_block_ids  # type: ignore
-
-    def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]:
-        """Return the physical block ids of the cross-attention block table
-        of the group's request.
-
-        The request must have a cross block table and none of its ids may be
-        None; both conditions are asserted.
-        """
-        request_id = seq_group.request_id
-        assert request_id in self.cross_block_tables
-        cross_ids = self.cross_block_tables[request_id].physical_block_ids
-        assert not any(block_id is None for block_id in cross_ids)
-        return cross_ids  # type: ignore
-
-    def access_all_blocks_in_seq(self, seq: Sequence, now: float):
-        """Record ``now`` as the last access time of ``seq``; does nothing
-        when caching is disabled."""
-        if not self.enable_caching:
-            return
-
-        # Record the latest access time for the sequence. The actual update
-        # of the block ids is deferred to the sequence free(..) call, since
-        # only during freeing of block ids, the blocks are actually added to
-        # the evictor (which is when the most updated time is required)
-        # (This avoids expensive calls to mark_blocks_as_accessed(..))
-        self._last_access_blocks_tracker.update_last_access(seq.seq_id, now)
-
-    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
-                                token_chunk_size: int):
-        """Forward to the allocator's ``mark_blocks_as_computed`` with an
-        empty id list; both arguments are unused here."""
-        # If prefix caching is enabled, mark immutable blocks as computed
-        # right after they have been scheduled (for prefill). This assumes
-        # the scheduler is synchronous so blocks are actually computed when
-        # scheduling the next batch.
-        no_block_ids: List[int] = []
-        self.block_allocator.mark_blocks_as_computed(no_block_ids)
-
-    def get_common_computed_block_ids(
-            self, seqs: List[Sequence]) -> GenericSequence[int]:
-        """Determine the blocks for which prefill can be skipped.
-
-        With prefix caching we can skip prefill for previously-generated blocks.
-        Currently, the attention implementation only supports skipping cached
-        blocks if they are a contiguous prefix of cached blocks.
-
-        For each sequence the leading blocks covered by its cached tokens are
-        collected; the allocator then reduces those per-sequence lists to the
-        ids that can be skipped for all of them.
-        """
-        per_seq_cached_ids = []
-        for seq in seqs:
-            table_ids = self.block_tables[seq.seq_id].physical_block_ids
-            cached_tokens = (
-                self._computed_blocks_tracker.get_num_cached_tokens(seq))
-            # Cached tokens must cover whole blocks.
-            cached_blocks, leftover = divmod(cached_tokens, self.block_size)
-            assert leftover == 0
-            per_seq_cached_ids.append(table_ids[:cached_blocks])
-
-        # NOTE(sang): This assumes seq_block_ids doesn't contain any None.
-        return self.block_allocator.get_common_computed_block_ids(
-            per_seq_cached_ids)  # type: ignore
-
-    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
-        if parent_seq.seq_id not in self.block_tables:
-            # Parent sequence has either been freed or never existed.
-            return
-        src_block_table = self.block_tables[parent_seq.seq_id]
-        self.block_tables[child_seq.seq_id] = src_block_table.fork()
-
-        # Track child seq
-        self._last_access_blocks_tracker.add_seq(child_seq.seq_id)
-
-    def can_swap_in(self, seq_group: SequenceGroup,
-                    num_lookahead_slots: int) -> AllocStatus:
-        """Returns the AllocStatus for the given sequence_group 
-        with num_lookahead_slots.
-
-        Args:
-            seq_group (SequenceGroup): The sequence group to swap in.
-            num_lookahead_slots (int): Number of lookahead slots used in 
-                speculative decoding, default to 0.
-
-        Returns:
-            AllocStatus: The AllocStatus for the given sequence group.
-        """
-        return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED,
-                              num_lookahead_slots)
-
-    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        """Returns the block id mapping (from CPU to GPU) generated by
-        swapping in the given seq_group with num_lookahead_slots.
-
-        Args:
-            seq_group (SequenceGroup): The sequence group to swap in.
-
-        Returns:
-            List[Tuple[int, int]]: The mapping of swapping block from CPU 
-                to GPU.
-        """
-        physical_block_id_mapping = []
-        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
-            blocks = self.block_tables[seq.seq_id].blocks
-            if len(blocks) == 0:
-                continue
-
-            seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
-                                                         src_device=Device.CPU,
-                                                         dst_device=Device.GPU)
-
-            # Refresh the block ids of the table (post-swap)
-            self.block_tables[seq.seq_id].update(blocks)
-
-            seq_physical_block_id_mapping = {
-                self.block_allocator.get_physical_block_id(
-                    Device.CPU, cpu_block_id):
-                self.block_allocator.get_physical_block_id(
-                    Device.GPU, gpu_block_id)
-                for cpu_block_id, gpu_block_id in seq_swap_mapping.items()
-            }
-
-            physical_block_id_mapping.extend(
-                list(seq_physical_block_id_mapping.items()))
-
-        return physical_block_id_mapping
-
-    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
-        """Returns whether we can swap out the given sequence_group 
-        with num_lookahead_slots.
-
-        Args:
-            seq_group (SequenceGroup): The sequence group to swap out.
-
-        Returns:
-            bool: Whether it's possible to swap out current sequence group.
-        """
-        alloc_status = self._can_swap(seq_group, Device.CPU,
-                                      SequenceStatus.RUNNING)
-        return alloc_status == AllocStatus.OK
-
-    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        """Returns the block id mapping (from GPU to CPU) generated by
-        swapping out the given sequence_group with num_lookahead_slots.
-
-        Args:
-            seq_group (SequenceGroup): The sequence group to swap out.
-
-        Returns:
-            List[Tuple[int, int]]: The mapping of swapping block from 
-                GPU to CPU.
-        """
-        physical_block_id_mapping = []
-        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            blocks = self.block_tables[seq.seq_id].blocks
-            if len(blocks) == 0:
-                continue
-
-            seq_swap_mapping = self.block_allocator.swap(blocks=blocks,
-                                                         src_device=Device.GPU,
-                                                         dst_device=Device.CPU)
-
-            # Refresh the block ids of the table (post-swap)
-            self.block_tables[seq.seq_id].update(blocks)
-
-            seq_physical_block_id_mapping = {
-                self.block_allocator.get_physical_block_id(
-                    Device.GPU, gpu_block_id):
-                self.block_allocator.get_physical_block_id(
-                    Device.CPU, cpu_block_id)
-                for gpu_block_id, cpu_block_id in seq_swap_mapping.items()
-            }
-
-            physical_block_id_mapping.extend(
-                list(seq_physical_block_id_mapping.items()))
-
-        return physical_block_id_mapping
-
-    def get_num_free_gpu_blocks(self) -> int:
-        return self.block_allocator.get_num_free_blocks(Device.GPU)
-
-    def get_num_free_cpu_blocks(self) -> int:
-        return self.block_allocator.get_num_free_blocks(Device.CPU)
-
-    def get_prefix_cache_hit_rate(self, device: Device) -> float:
-        return self.block_allocator.get_prefix_cache_hit_rate(device)
-
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        return self.block_allocator.reset_prefix_cache(device)
-
-    def _can_swap(self,
-                  seq_group: SequenceGroup,
-                  device: Device,
-                  status: SequenceStatus,
-                  num_lookahead_slots: int = 0) -> AllocStatus:
-        """Returns the AllocStatus for swapping in/out the given sequence_group 
-        on to the 'device'.
-
-        Args:
-            seq_group (SequenceGroup): The sequence group to swap in/out.
-            device (Device): device to swap the 'seq_group' on.
-            status (SequenceStatus): The status of sequence which is needed
-                for action. RUNNING for swap out and SWAPPED for swap in
-            num_lookahead_slots (int): Number of lookahead slots used in 
-                speculative decoding, default to 0.
-
-        Returns:
-            AllocStatus: The AllocStatus for swapping in/out the given 
-                sequence_group on to the 'device'.
-        """
-        # First determine the number of blocks that will be touched by this
-        # swap. Then verify if there are available blocks in the device
-        # to perform the swap.
-        num_blocks_touched = 0
-        blocks: List[Block] = []
-        for seq in seq_group.get_seqs(status=status):
-            block_table = self.block_tables[seq.seq_id]
-            if block_table.blocks is not None:
-                # Compute the number blocks to touch for the tokens to be
-                # appended. This does NOT include the full blocks that need
-                # to be touched for the swap.
-                num_blocks_touched += \
-                    block_table.get_num_blocks_touched_by_append_slots(
-                        block_table.get_unseen_token_ids(seq.get_token_ids()),
-                        num_lookahead_slots=num_lookahead_slots)
-                blocks.extend(block_table.blocks)
-        # Compute the number of full blocks to touch and add it to the
-        # existing count of blocks to touch.
-        num_blocks_touched += self.block_allocator.get_num_full_blocks_touched(
-            blocks, device=device)
-
-        watermark_blocks = 0
-        if device == Device.GPU:
-            watermark_blocks = self.watermark_blocks
-
-        if self.block_allocator.get_num_total_blocks(
-                device) < num_blocks_touched:
-            return AllocStatus.NEVER
-        elif self.block_allocator.get_num_free_blocks(
-                device) - num_blocks_touched >= watermark_blocks:
-            return AllocStatus.OK
-        else:
-            return AllocStatus.LATER
-
-    def get_num_cached_tokens(self, seq: Sequence) -> int:
-        """Get the number of tokens in blocks that are already computed and
-        cached in the block manager for the sequence.
-        """
-        return self._computed_blocks_tracker.get_num_cached_tokens(seq)
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@ -1,157 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import enum
-import heapq
-from abc import ABC, abstractmethod
-from typing import Dict, List, Tuple
-
-
-class EvictionPolicy(enum.Enum):
-    """Enum for eviction policy used by make_evictor to instantiate the correct
-       Evictor subclass.
-    """
-    LRU = enum.auto()
-
-
-class Evictor(ABC):
-    """The Evictor subclasses should be used by the BlockAllocator class to
-    handle eviction of freed Blocks.
-    """
-
-    @abstractmethod
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def __contains__(self, block_id: int) -> bool:
-        pass
-
-    @abstractmethod
-    def evict(self) -> Tuple[int, int]:
-        """Runs the eviction algorithm and returns the evicted block's
-        content hash along with physical block id along with physical block id
-        """
-        pass
-
-    @abstractmethod
-    def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
-            last_accessed: float):
-        """Adds block to the evictor, making it a candidate for eviction"""
-        pass
-
-    @abstractmethod
-    def update(self, block_id: int, last_accessed: float):
-        """Update corresponding block's access time in metadata"""
-        pass
-
-    @abstractmethod
-    def remove(self, block_id: int):
-        """Remove a given block id from the cache."""
-        pass
-
-    @property
-    @abstractmethod
-    def num_blocks(self) -> int:
-        pass
-
-
-class BlockMetaData:
-    """Data structure for storing key data describe cached block, so that
-    evictor could use to make its decision which one to choose for eviction
-
-    Here we use physical block id as the dict key, as there maybe several
-    blocks with the same content hash, but their physical id is unique.
-    """
-
-    def __init__(self, content_hash: int, num_hashed_tokens: int,
-                 last_accessed: float):
-        self.content_hash = content_hash
-        self.num_hashed_tokens = num_hashed_tokens
-        self.last_accessed = last_accessed
-
-
-class LRUEvictor(Evictor):
-    """Evicts in a least-recently-used order using the last_accessed timestamp
-    that's recorded in the Block. If there are multiple blocks with
-    the same last_accessed time, then the one with the largest num_hashed_tokens
-    will be evicted. If two blocks each have the lowest last_accessed time and
-    highest num_hashed_tokens value, then one will be chosen arbitrarily
-    """
-
-    # CLEANUP_THRESHOLD determines the maximum allowable size of the priority
-    # queue relative to the free table size. When this threshold is exceeded,
-    # a cleanup operation is triggered to reduce memory usage.
-    CLEANUP_THRESHOLD = 50
-
-    def __init__(self):
-        self.free_table: Dict[int, BlockMetaData] = {}
-        self.priority_queue = []
-
-    def __contains__(self, block_id: int) -> bool:
-        return block_id in self.free_table
-
-    def evict(self) -> Tuple[int, int]:
-        if len(self.free_table) == 0:
-            raise ValueError("No usable cache memory left")
-
-        while self.priority_queue:
-            # We do not remove outdated entries from the priority queue at the
-            # time of updating the last_accessed timestamp. Instead, outdated
-            # entries are filtered out here during eviction. Outdated entries
-            # would either not in the free table, or have older last accessed
-            # time.
-            last_accessed, _, block_id, content_hash = heapq.heappop(
-                self.priority_queue)
-            if (block_id in self.free_table and
-                    self.free_table[block_id].last_accessed == last_accessed):
-                self.free_table.pop(block_id)
-                return block_id, content_hash
-
-        raise ValueError("No usable cache memory left")
-
-    def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
-            last_accessed: float):
-        self.free_table[block_id] = BlockMetaData(content_hash,
-                                                  num_hashed_tokens,
-                                                  last_accessed)
-        heapq.heappush(
-            self.priority_queue,
-            (last_accessed, -num_hashed_tokens, block_id, content_hash))
-        self._cleanup_if_necessary()
-
-    def update(self, block_id: int, last_accessed: float):
-        self.free_table[block_id].last_accessed = last_accessed
-
-    def _cleanup_if_necessary(self):
-        if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len(
-                self.free_table):
-            self._cleanup()
-
-    def _cleanup(self):
-        new_priority_queue: List[Tuple[float, int, int, int]] = []
-
-        for block_id, block in self.free_table.items():
-            new_priority_queue.append(
-                (block.last_accessed, -block.num_hashed_tokens, block_id,
-                 block.content_hash))
-        heapq.heapify(new_priority_queue)
-
-        self.priority_queue = new_priority_queue
-
-    def remove(self, block_id: int):
-        if block_id not in self.free_table:
-            raise ValueError(
-                "Attempting to remove block that's not in the evictor")
-        self.free_table.pop(block_id)
-
-    @property
-    def num_blocks(self) -> int:
-        return len(self.free_table)
-
-
-def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
-    if eviction_policy == EvictionPolicy.LRU:
-        return LRUEvictor()
-    else:
-        raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@ -1,139 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import enum
-from abc import ABC, abstractmethod
-from typing import List, Optional
-from typing import Sequence as GenericSequence
-from typing import Tuple
-
-from vllm.sequence import Sequence, SequenceGroup
-from vllm.utils import Device
-
-
-class AllocStatus(enum.Enum):
-    """Result for BlockSpaceManager.can_allocate
-
-    1. Ok: seq_group can be allocated now.
-    2. Later: seq_group cannot be allocated.
-      The capacity of allocator is larger than seq_group required.
-    3. Never: seq_group can never be allocated.
-      The seq_group is too large to allocated in GPU.
-    """
-    OK = enum.auto()
-    LATER = enum.auto()
-    NEVER = enum.auto()
-
-
-class BlockSpaceManager(ABC):
-
-    @staticmethod
-    def get_block_space_manager_class(version: str):
-        version = version.lower()
-
-        if version == "selfattn":
-            from vllm.core.block_manager import SelfAttnBlockSpaceManager
-            return SelfAttnBlockSpaceManager
-
-        if version == "placeholder":
-            from vllm.core.placeholder_block_space_manager import (
-                PlaceholderBlockSpaceManager)
-            return PlaceholderBlockSpaceManager
-
-        raise ValueError(f"Unknown version {version=}")
-
-    @abstractmethod
-    def can_allocate(self,
-                     seq_group: SequenceGroup,
-                     num_lookahead_slots: int = 0) -> AllocStatus:
-        pass
-
-    @abstractmethod
-    def allocate(self, seq_group: SequenceGroup) -> None:
-        pass
-
-    @abstractmethod
-    def can_append_slots(self, seq_group: SequenceGroup,
-                         num_lookahead_slots: int) -> bool:
-        pass
-
-    @abstractmethod
-    def append_slots(
-        self,
-        seq: Sequence,
-        num_lookahead_slots: int,
-    ) -> List[Tuple[int, int]]:
-        pass
-
-    @abstractmethod
-    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
-        pass
-
-    @abstractmethod
-    def can_swap_in(self, seq_group: SequenceGroup,
-                    num_lookahead_slots: int) -> AllocStatus:
-        pass
-
-    @abstractmethod
-    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        pass
-
-    @abstractmethod
-    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
-        pass
-
-    @abstractmethod
-    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        pass
-
-    @abstractmethod
-    def free(self, seq: Sequence) -> None:
-        pass
-
-    @abstractmethod
-    def get_block_table(self, seq: Sequence) -> List[int]:
-        pass
-
-    @abstractmethod
-    def get_num_free_gpu_blocks(self) -> int:
-        pass
-
-    @abstractmethod
-    def get_num_free_cpu_blocks(self) -> int:
-        pass
-
-    @abstractmethod
-    def access_all_blocks_in_seq(
-        self,
-        seq: Sequence,
-        access_time: float,
-    ) -> None:
-        pass
-
-    @abstractmethod
-    def get_common_computed_block_ids(
-            self, seqs: List[Sequence]) -> GenericSequence[int]:
-        pass
-
-    @abstractmethod
-    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
-                                token_chunk_size: int):
-        pass
-
-    @abstractmethod
-    def get_prefix_cache_hit_rate(self, device: Device) -> float:
-        """Prefix cache hit rate. -1 means not supported or disabled."""
-        pass
-
-    @abstractmethod
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        """Reset prefix cache for specified or all devices."""
-        pass
-
-    @abstractmethod
-    def get_num_cached_tokens(self, seq: Sequence) -> int:
-        pass
-
-    @abstractmethod
-    def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
-        pass
--- a/vllm/core/placeholder_block_space_manager.py
+++ b/vllm/core/placeholder_block_space_manager.py
@ -1,103 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import List, Optional, Tuple
-
-from vllm.core.interfaces import AllocStatus, BlockSpaceManager
-from vllm.sequence import Sequence, SequenceGroup
-from vllm.utils import Device
-
-
-class PlaceholderBlockSpaceManager(BlockSpaceManager):
-    """A version of BlockSpaceManager for use in environments
-    where block management is not required. 
-    For example: pooling models or attention-free models like Mamba.
-
-    This class provides the same interface as BlockSpaceManager, but its
-    methods perform no actions or return simple values like True in specific
-    actions. It's designed to be used in scenarios where the overhead of
-    block management is unnecessary, such as in an embedding environment.
-    """
-
-    def __init__(
-        self,
-        **kwargs,
-    ) -> None:
-        pass
-
-    def can_allocate(self,
-                     seq_group: SequenceGroup,
-                     num_lookahead_slots: int = 0) -> AllocStatus:
-        # Always return OK for dummy purposes
-        return AllocStatus.OK
-
-    def allocate(self, seq_group: SequenceGroup) -> None:
-        # No actual allocation logic needed
-        pass
-
-    def can_append_slots(self, seq_group: SequenceGroup,
-                         num_lookahead_slots: int) -> bool:
-        return True
-
-    def append_slots(
-        self,
-        seq: Sequence,
-        num_lookahead_slots: int,
-    ) -> List[Tuple[int, int]]:
-        return []
-
-    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
-        pass
-
-    def can_swap_in(self, seq_group: SequenceGroup,
-                    num_lookahead_slots: int) -> AllocStatus:
-        return AllocStatus.OK
-
-    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        return None  # type: ignore
-
-    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
-        return True
-
-    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        return None  # type: ignore
-
-    def free(self, seq: Sequence) -> None:
-        # No operation on free
-        return
-
-    def get_block_table(self, seq: Sequence) -> List[int]:
-        return None  # type: ignore
-
-    def get_num_free_gpu_blocks(self) -> int:
-        return 1
-
-    def get_num_free_cpu_blocks(self) -> int:
-        return 1
-
-    def access_all_blocks_in_seq(
-        self,
-        seq: Sequence,
-        access_time: float,
-    ) -> None:
-        pass
-
-    def get_common_computed_block_ids(self,
-                                      seq_group: List[Sequence]) -> List[int]:
-        return []
-
-    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
-                                token_chunk_size: int):
-        pass
-
-    def get_prefix_cache_hit_rate(self, device: Device) -> float:
-        return -1
-
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
-        return True
-
-    def get_num_cached_tokens(self, seq: Sequence) -> int:
-        return 0
-
-    def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None:
-        return
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -41,7 +41,8 @@ from vllm.plugins import load_general_plugins
 from vllm.ray.lazy_utils import is_ray_initialized
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
-from vllm.transformers_utils.config import get_model_path, is_interleaved
+from vllm.transformers_utils.config import (get_model_path, is_interleaved,
+                                            maybe_override_with_speculators)
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
                        GiB_bytes, get_ip, is_in_ray_actor)
@ -409,9 +410,7 @@ class EngineArgs:
        get_field(LoadConfig, "model_loader_extra_config")
    ignore_patterns: Optional[Union[str,
                                    List[str]]] = LoadConfig.ignore_patterns
-    preemption_mode: Optional[str] = SchedulerConfig.preemption_mode

-    scheduler_delay_factor: float = SchedulerConfig.delay_factor
    enable_chunked_prefill: Optional[
        bool] = SchedulerConfig.enable_chunked_prefill
    disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
@ -439,7 +438,6 @@ class EngineArgs:
        ObservabilityConfig.otlp_traces_endpoint
    collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
        ObservabilityConfig.collect_detailed_traces
-    disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
    scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
    scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls

@ -561,14 +559,6 @@ class EngineArgs:
                                 **model_kwargs["enable_prompt_embeds"])
        model_group.add_argument("--served-model-name",
                                 **model_kwargs["served_model_name"])
-        # This one is a special case because it is the
-        # opposite of ModelConfig.use_async_output_proc
-        model_group.add_argument(
-            "--disable-async-output-proc",
-            action="store_true",
-            default=EngineArgs.disable_async_output_proc,
-            help="Disable async output processing. This may result in "
-            "lower performance.")
        model_group.add_argument("--config-format",
                                 **model_kwargs["config_format"])
        # This one is a special case because it can bool
@ -897,10 +887,6 @@ class EngineArgs:
            **scheduler_kwargs["long_prefill_token_threshold"])
        scheduler_group.add_argument("--num-lookahead-slots",
                                     **scheduler_kwargs["num_lookahead_slots"])
-        scheduler_group.add_argument("--scheduler-delay-factor",
-                                     **scheduler_kwargs["delay_factor"])
-        scheduler_group.add_argument("--preemption-mode",
-                                     **scheduler_kwargs["preemption_mode"])
        # multi-step scheduling has been removed; corresponding arguments
        # are no longer supported.
        scheduler_group.add_argument("--scheduling-policy",
@ -1029,7 +1015,6 @@ class EngineArgs:
            interleave_mm_strings=self.interleave_mm_strings,
            media_io_kwargs=self.media_io_kwargs,
            skip_mm_profiling=self.skip_mm_profiling,
-            use_async_output_proc=not self.disable_async_output_proc,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,
            mm_processor_cache_gb=self.mm_processor_cache_gb,
@ -1098,29 +1083,8 @@ class EngineArgs:
        provided as a JSON string input via CLI arguments or directly as a
        dictionary from the engine.
        """
-
-        from vllm.transformers_utils.config import get_config
-        from vllm.transformers_utils.configs.speculators.base import (
-            SpeculatorsConfig)
-
        if self.speculative_config is None:
-            hf_config = get_config(
-                self.hf_config_path or target_model_config.model,
-                self.trust_remote_code, self.revision, self.code_revision,
-                self.config_format)
-
-            # if loading a SpeculatorsConfig, load the speculative_config
-            # details from the config directly
-            # no user input required / expected
-            if isinstance(hf_config, SpeculatorsConfig):
-                # We create one since we don't create one
-                self.speculative_config = {}
-                self.speculative_config[
-                    "num_speculative_tokens"] = hf_config.num_lookahead_tokens
-                self.speculative_config["model"] = target_model_config.model
-                self.speculative_config["method"] = hf_config.method
-            else:
-                return None
+            return None

        # Note(Shangming): These parameters are not obtained from the cli arg
        # '--speculative-config' and must be passed in when creating the engine
@ -1155,6 +1119,15 @@ class EngineArgs:

        device_config = DeviceConfig(
            device=cast(Device, current_platform.device_type))
+
+        (self.model, self.tokenizer,
+         self.speculative_config) = maybe_override_with_speculators(
+             model=self.model,
+             tokenizer=self.tokenizer,
+             revision=self.revision,
+             trust_remote_code=self.trust_remote_code,
+             vllm_speculative_config=self.speculative_config,
+         )
        model_config = self.create_model_config()

        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
@ -1395,11 +1368,9 @@ class EngineArgs:
            max_model_len=model_config.max_model_len,
            cuda_graph_sizes=self.cuda_graph_sizes,
            num_lookahead_slots=num_lookahead_slots,
-            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_chunked_mm_input=self.disable_chunked_mm_input,
            is_multimodal_model=model_config.is_multimodal_model,
-            preemption_mode=self.preemption_mode,
            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                             and parallel_config.use_ray),
            policy=self.scheduling_policy,
@ -1486,42 +1457,12 @@ class EngineArgs:
        #############################################################
        # Unsupported Feature Flags on V1.

-        if self.load_format == "sharded_state":
-            _raise_or_fallback(
-                feature_name=f"--load_format {self.load_format}",
-                recommend_to_remove=False)
-            return False
-
        if (self.logits_processor_pattern
                != EngineArgs.logits_processor_pattern):
            _raise_or_fallback(feature_name="--logits-processor-pattern",
                               recommend_to_remove=False)
            return False

-        if self.preemption_mode != SchedulerConfig.preemption_mode:
-            _raise_or_fallback(feature_name="--preemption-mode",
-                               recommend_to_remove=True)
-            return False
-
-        if (self.disable_async_output_proc
-                != EngineArgs.disable_async_output_proc):
-            _raise_or_fallback(feature_name="--disable-async-output-proc",
-                               recommend_to_remove=True)
-            return False
-
-        if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
-            _raise_or_fallback(feature_name="--scheduler-delay-factor",
-                               recommend_to_remove=True)
-            return False
-
-        if self.kv_cache_dtype != "auto":
-            supported = current_platform.is_kv_cache_dtype_supported(
-                self.kv_cache_dtype, model_config)
-            if not supported:
-                _raise_or_fallback(feature_name="--kv-cache-dtype",
-                                   recommend_to_remove=False)
-                return False
-
        # No Mamba or Encoder-Decoder so far.
        if not model_config.is_v1_compatible:
            _raise_or_fallback(feature_name=model_config.architectures,
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
--- a/vllm/engine/output_processor/init.py
+++ b/vllm/engine/output_processor/init.py
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@ -1,59 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from abc import ABC, abstractmethod
-from typing import List
-
-from vllm.config import SchedulerConfig
-from vllm.core.scheduler import Scheduler
-from vllm.engine.output_processor.stop_checker import StopChecker
-from vllm.sequence import SequenceGroup, SequenceGroupOutput
-from vllm.transformers_utils.detokenizer import Detokenizer
-from vllm.utils import Counter
-
-
-class SequenceGroupOutputProcessor(ABC):
-    """Interface for logic that processes new token ids in sequence groups,
-    managing detokenization, stop checking, and freeing/forking sequences with
-    the scheduler.
-
-    This is highly coupled with the LLMEngine and should be seen as an extension
-    of it. The logic is separated to simplify the LLMEngine class and allow
-    separate implementations for single-step decoding (which supports beam
-    search sequence forking) and multi-step decoding (which does not support
-    beam search, but does support speculative decoding).
-    """
-
-    @staticmethod
-    def create_output_processor(
-        scheduler_config: SchedulerConfig,
-        detokenizer: Detokenizer,
-        scheduler: List[Scheduler],
-        seq_counter: Counter,
-        stop_checker: "StopChecker",
-    ):
-        """Create an output processor.
-
-        Multi-step scheduling is no longer supported. Always return a
-        single-step output processor.
-        """
-        from vllm.engine.output_processor.single_step import (
-            SingleStepOutputProcessor)
-        return SingleStepOutputProcessor(scheduler_config, detokenizer,
-                                         scheduler, seq_counter, stop_checker)
-
-    @abstractmethod
-    def process_outputs(self, sequence_group: SequenceGroup,
-                        outputs: List[SequenceGroupOutput],
-                        is_async: bool) -> None:
-        """Process new token ids for the sequence group. Handles logic such as
-        detokenization, stop checking, and freeing/forking sequences in the
-        scheduler.
-        """
-        pass
-
-    @abstractmethod
-    def process_prompt_logprob(self, seq_group: SequenceGroup,
-                               outputs: List[SequenceGroupOutput]) -> None:
-        """Update prompt logprobs received from outputs to seq_group."""
-        pass
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@ -1,145 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import List
-
-from vllm.config import SchedulerConfig
-from vllm.core.scheduler import Scheduler
-from vllm.engine.output_processor.interfaces import (
-    SequenceGroupOutputProcessor)
-from vllm.engine.output_processor.stop_checker import StopChecker
-from vllm.logger import init_logger
-from vllm.sequence import (CompletionSequenceGroupOutput, SequenceGroup,
-                           SequenceGroupOutput)
-from vllm.transformers_utils.detokenizer import Detokenizer
-from vllm.utils import Counter
-
-logger = init_logger(__name__)
-
-
-def single_step_process_prompt_logprob(
-        sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
-        output: CompletionSequenceGroupOutput) -> None:
-    """Process prompt logprobs associated with the
-    [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.
-
-    Do nothing if the output has no prompt logprobs.
-
-    Account for the fact that transformers do not compute first-token logprobs.
-    
-    Args:
-      sg_output_proc:
-          [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
-          instance
-      seq_group: the output is associated with this
-          [`SequenceGroup`][vllm.sequence.SequenceGroup]
-      output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
-          for a single scheduler step
-    """
-    prompt_logprobs = output.prompt_logprobs
-
-    # If this is the first (or only) "chunk" of the prefill, we need
-    # to prepend None to the list of prompt logprobs. The reason for this
-    # is that for N prompt tokens, the Sampler will generate N-1 total
-    # prompt logprobs during prefill since the token at idx 0 will not
-    # have a logprob associated with it.
-    if prompt_logprobs is not None:
-        if not seq_group.prompt_logprobs:
-            prompt_logprobs = [None] + prompt_logprobs
-            seq_group.prompt_logprobs = []
-
-        assert hasattr(sg_output_proc, 'detokenizer')
-        if (seq_group.sampling_params.detokenize
-                and sg_output_proc.detokenizer):
-            sg_output_proc.detokenizer.decode_prompt_logprobs_inplace(
-                seq_group,
-                prompt_logprobs,
-                position_offset=len(seq_group.prompt_logprobs))
-
-        seq_group.prompt_logprobs.extend(prompt_logprobs)
-
-
-class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
-    """SequenceGroupOutputProcessor which handles "output processing" logic,
-    which happens after the model returns generated token ids and before
-    scheduling of the next batch. Output processing logic includes
-    detokenization, and determining if a sequence is finished (e.g. via max len
-    or eos token).
-
-    The SingleStepOutputProcessor is specialized to the case where the model
-    emits at most a single token per invocation, which precludes configurations
-    such as speculative decoding or multi-step decoding. This enables beam
-    search sampling, which requires forking/finishing/freeing sequences in a way
-    that is currently difficult to schedule multiple steps ahead of time.
-    """
-
-    def __init__(self, scheduler_config: SchedulerConfig,
-                 detokenizer: Detokenizer, scheduler: List[Scheduler],
-                 seq_counter: Counter, stop_checker: StopChecker):
-        self.scheduler_config = scheduler_config
-        self.detokenizer = detokenizer
-        self.scheduler = scheduler
-        self.seq_counter = seq_counter
-        self.stop_checker = stop_checker
-
-    def process_outputs(self, sequence_group: SequenceGroup,
-                        outputs: List[SequenceGroupOutput],
-                        is_async: bool) -> None:
-        """Append all new tokens to sequences in the sequence group. Fork any
-        surviving beam candidates; free any unsurviving ones.
-
-        Invokes detokenizer to detokenize new tokens, and also marks sequences
-        as finished if they meet stop conditions.
-        
-        is_async - Indicates whether this postprocessor runs in 
-            parallel with the GPU forward pass and is processing 
-            tokens from the previous step. If this is true, then
-            no tokens need to be appended since it is already done
-            externally (before the next schedule() call)
-        """
-        assert (len(outputs) == 1
-                ), f"{type(self)} does not support multiple outputs per step"
-        return self._process_sequence_group_outputs(sequence_group, outputs[0],
-                                                    is_async)
-
-    def process_prompt_logprob(self, seq_group: SequenceGroup,
-                               outputs: List[SequenceGroupOutput]) -> None:
-        """Process prompt logprobs associated with one step of a single-step-
-        scheduled computation.
-        
-        Args:
-          seq_group: the output is associated with this
-              [`SequenceGroup`][vllm.sequence.SequenceGroup]
-          outputs: the
-              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
-              for a single scheduler step
-        """
-        assert len(outputs) == 1, "Single step should only have 1 output."
-        output = outputs[0]
-        assert isinstance(output, CompletionSequenceGroupOutput)
-        single_step_process_prompt_logprob(self, seq_group, output)
-
-    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
-                                        outputs: SequenceGroupOutput,
-                                        is_async: bool) -> None:
-        sampling_params = seq_group.sampling_params
-
-        sample = outputs.samples[0]
-        seq = seq_group.first_seq
-        if not is_async:
-            seq.append_token_id(sample.output_token, sample.logprobs,
-                                sample.output_embed)
-        if sampling_params.detokenize and self.detokenizer:
-            new_char_count = self.detokenizer.decode_sequence_inplace(
-                seq, sampling_params)
-        else:
-            new_char_count = 0
-        self.stop_checker.maybe_stop_sequence(
-            seq,
-            new_char_count,
-            sampling_params,
-            lora_req=seq_group.lora_request,
-        )
-        if seq.is_finished():
-            for scheduler in self.scheduler:
-                scheduler.free_seq(seq)
--- a/vllm/engine/output_processor/stop_checker.py
+++ b/vllm/engine/output_processor/stop_checker.py
@ -1,139 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import List, Optional, Tuple
-
-from vllm.lora.request import LoRARequest
-from vllm.reasoning import ReasoningParser
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import Sequence, SequenceStatus
-
-
-class StopChecker:
-    """LLMEngine helper class which separates out the logic involving stop
-    checking. This checks things such as: whether the eos token was emitted,
-    whether the max_tokens has been consumed, whether a stop string has been
-    emitted, or if we have exceeded the max model len.
-    """
-
-    def __init__(
-        self,
-        max_model_len: int,
-        reasoner: Optional[ReasoningParser] = None,
-    ):
-        # Do not use it directly, but use `self._get_max_model_len`.
-        self._max_model_len = max_model_len
-        self.reasoner = reasoner
-
-    def _get_max_model_len(self, lora_req: Optional[LoRARequest]):
-        if lora_req and lora_req.long_lora_max_len:
-            return lora_req.long_lora_max_len
-        else:
-            return self._max_model_len
-
-    def maybe_stop_sequence(
-        self,
-        seq: Sequence,
-        new_char_count: int,
-        sampling_params: SamplingParams,
-        lora_req: Optional[LoRARequest] = None,
-    ) -> None:
-        """Stop the finished sequences.
-
-       new_char_count is the number of chars added to the
-           sequence's output text for the newly generated token
-        """
-
-        # Check if the minimum number of tokens has been generated yet;
-        # skip the stop string/token checks if not
-        if seq.get_output_len() < sampling_params.min_tokens:
-            return
-
-        # Check if the sequence has generated the EOS token.
-        if ((not sampling_params.ignore_eos)
-                and seq.get_last_token_id() == seq.eos_token_id):
-            # Remove the last EOS token unless explicitly specified
-            # This prevents unintended exposure of the EOS token
-            if new_char_count and (
-                    not sampling_params.include_stop_str_in_output):
-                seq.output_text = seq.output_text[:-new_char_count]
-            seq.status = SequenceStatus.FINISHED_STOPPED
-            return
-
-        # Skip stop string/token checks if in reasoning content generation
-        if self.reasoner is not None and \
-            not self.reasoner.is_reasoning_end(seq.get_token_ids()):
-            return
-
-        # Check if a stop token was encountered.
-        # This assumes a single token produced per step.
-        last_token_id = seq.get_last_token_id()
-        if last_token_id in (sampling_params.stop_token_ids or ()):
-            if new_char_count and (
-                    not sampling_params.include_stop_str_in_output):
-                # Remove last token
-                seq.output_text = seq.output_text[:-new_char_count]
-            seq.status = SequenceStatus.FINISHED_STOPPED
-            seq.stop_reason = last_token_id
-            return
-
-        # Check if any stop strings are matched.
-        stop = self.check_stop_strings(
-            seq.output_text, new_char_count, sampling_params.stop,
-            sampling_params.include_stop_str_in_output)
-        if stop is not None:
-            stop_str, truncate_to = stop
-            if truncate_to != -1:
-                seq.output_text = seq.output_text[:truncate_to]
-            seq.status = SequenceStatus.FINISHED_STOPPED
-            seq.stop_reason = stop_str
-            return
-
-        # Check if the sequence has reached max_model_len.
-        if seq.get_len() >= self._get_max_model_len(lora_req):
-            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
-            return
-
-        # Check if the sequence has reached max_tokens.
-        if seq.get_output_len() == sampling_params.max_tokens:
-            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
-            return
-
-    @staticmethod
-    def check_stop_strings(
-        output_text: str,
-        new_char_count: int,
-        stop: List[str],
-        include_in_output: bool,
-    ) -> Optional[Tuple[str, int]]:
-        """Check if any stop strings are matched and truncate sequence
-        output text accordingly.
-
-        Returns tuple (stop_string, offset) if matched or else None.
-
-        Where stop_string is the matched stop string and offset is the
-        length to which output_text should be truncated, or -1 for no
-        truncation.
-        """
-        if not new_char_count or not stop:
-            return None
-
-        for stop_str in stop:
-            stop_string_len = len(stop_str)
-            # Avoid searching already-searched text.
-            stop_index = output_text.find(stop_str,
-                                          1 - new_char_count - stop_string_len)
-            if stop_index == -1:
-                continue
-
-            if include_in_output:
-                # Truncate to end of stop string.
-                stop_index += stop_string_len
-                if stop_index >= len(output_text):
-                    # No truncation required.
-                    return stop_str, -1
-
-            # Truncate the output text to either the beginning
-            # or end of the stop string.
-            return stop_str, stop_index
-        return None
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@ -7,13 +7,11 @@ from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union

 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
 from vllm.config import ModelConfig, VllmConfig
-from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
 from vllm.plugins.io_processors.interface import IOProcessor
 from vllm.pooling_params import PoolingParams
@ -266,11 +264,7 @@ class EngineClient(ABC):
        ...

    @abstractmethod
-    async def do_log_stats(
-        self,
-        scheduler_outputs: Optional[SchedulerOutputs] = None,
-        model_output: Optional[list[SamplerOutput]] = None,
-    ) -> None:
+    async def do_log_stats(self) -> None:
        ...

    @abstractmethod
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@ -421,6 +421,51 @@ def resolve_mistral_chat_template(
    return None


+_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]()
+"""
+Used in `_try_get_processor_chat_template` to avoid calling
+`cached_get_processor` again if the processor fails to be loaded.
+
+This is needed because `lru_cache` does not cache when an exception happens.
+"""
+
+
+def _try_get_processor_chat_template(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    model_config: ModelConfig,
+) -> Optional[str]:
+    cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
+    if cache_key in _PROCESSOR_CHAT_TEMPLATES:
+        return _PROCESSOR_CHAT_TEMPLATES[cache_key]
+
+    try:
+        processor = cached_get_processor(
+            tokenizer.name_or_path,
+            processor_cls=(
+                PreTrainedTokenizer,
+                PreTrainedTokenizerFast,
+                ProcessorMixin,
+            ),
+            trust_remote_code=model_config.trust_remote_code,
+        )
+        if (
+            isinstance(processor, ProcessorMixin)
+            and hasattr(processor, "chat_template")
+            and (chat_template := processor.chat_template) is not None
+        ):
+            _PROCESSOR_CHAT_TEMPLATES[cache_key] = chat_template
+            return chat_template
+    except Exception:
+        logger.debug(
+            "Failed to load AutoProcessor chat template for %s",
+            tokenizer.name_or_path,
+            exc_info=True,
+        )
+
+    _PROCESSOR_CHAT_TEMPLATES[cache_key] = None
+    return None
+
+
 def resolve_hf_chat_template(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    chat_template: Optional[str],
@ -434,28 +479,10 @@ def resolve_hf_chat_template(

    # 2nd priority: AutoProcessor chat template, unless tool calling is enabled
    if tools is None:
-        try:
-            processor = cached_get_processor(
-                tokenizer.name_or_path,
-                processor_cls=(
-                    PreTrainedTokenizer,
-                    PreTrainedTokenizerFast,
-                    ProcessorMixin,
-                ),
-                trust_remote_code=model_config.trust_remote_code,
-            )
-            if (
-                isinstance(processor, ProcessorMixin)
-                and hasattr(processor, "chat_template")
-                and processor.chat_template is not None
-            ):
-                return processor.chat_template
-        except Exception:
-            logger.debug(
-                "Failed to load AutoProcessor chat template for %s",
-                tokenizer.name_or_path,
-                exc_info=True,
-            )  # noqa: E501
+        chat_template = _try_get_processor_chat_template(tokenizer,
+                                                         model_config)
+        if chat_template is not None:
+            return chat_template

    # 3rd priority: AutoTokenizer chat template
    try:
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -11,7 +11,6 @@ from pydantic import ValidationError
 from tqdm.auto import tqdm
 from typing_extensions import TypeVar

-import vllm.envs as envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                              BeamSearchSequence,
                              create_sort_beams_key_function)
@ -19,7 +18,6 @@ from vllm.config import (CompilationConfig, ModelDType,
                         StructuredOutputsConfig, TokenizerMode, is_init_field)
 from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
                                   PoolerConfig, RunnerOption)
-from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                         ChatTemplateContentFormatOption,
                                         apply_hf_chat_template,
@ -54,6 +52,7 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
                                               get_cached_tokenizer)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, Device, as_iter, is_list_of
+from vllm.v1.engine.llm_engine import LLMEngine
 from vllm.v1.sample.logits_processor import LogitsProcessor

 if TYPE_CHECKING:
@ -138,8 +137,6 @@ class LLM:
            back to the eager mode.
        disable_custom_all_reduce: See
            [ParallelConfig][vllm.config.ParallelConfig].
-        disable_async_output_proc: Disable async output processing.
-            This may result in lower performance.
        hf_token: The token to use as HTTP bearer authorization for remote files
            . If `True`, will use the token generated when running
            `huggingface-cli login` (stored in `~/.huggingface`).
@ -189,7 +186,6 @@ class LLM:
        enforce_eager: bool = False,
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
-        disable_async_output_proc: bool = False,
        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
@ -287,7 +283,6 @@ class LLM:
            enforce_eager=enforce_eager,
            max_seq_len_to_capture=max_seq_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
-            disable_async_output_proc=disable_async_output_proc,
            hf_token=hf_token,
            hf_overrides=hf_overrides,
            mm_processor_kwargs=mm_processor_kwargs,
@ -309,11 +304,7 @@ class LLM:
        self.request_counter = Counter()
        self.default_sampling_params: Union[dict[str, Any], None] = None

-        if envs.VLLM_USE_V1:
-            supported_tasks = self.llm_engine \
-                .get_supported_tasks()  # type: ignore
-        else:
-            supported_tasks = self.llm_engine.model_config.supported_tasks
+        supported_tasks = self.llm_engine.get_supported_tasks()  # type: ignore

        logger.info("Supported_tasks: %s", supported_tasks)

@ -1473,8 +1464,6 @@ class LLM:
        Note:
            This method is only available with the V1 LLM engine.
        """
-        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
-        assert isinstance(self.llm_engine, V1LLMEngine)
        return self.llm_engine.get_metrics()

    def _validate_and_add_requests(
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@ -15,10 +15,10 @@ from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.tasks import SupportedTask
 from vllm.utils import make_async
+from vllm.v1.outputs import SamplerOutput
 from vllm.worker.worker_base import WorkerBase

 logger = init_logger(__name__)
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@ -1,244 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import os
-from typing import Any, Callable, List, Optional, Union
-
-import cloudpickle
-
-from vllm.executor.executor_base import DistributedExecutorBase
-from vllm.executor.multiproc_worker_utils import (
-    ProcessWorkerWrapper, ResultHandler, WorkerMonitor,
-    set_multiprocessing_worker_envs)
-from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_ip, get_open_port,
-                        make_async, run_method, update_environment_variables)
-from vllm.worker.worker_base import WorkerWrapperBase
-
-logger = init_logger(__name__)
-
-
-class MultiprocessingDistributedExecutor(DistributedExecutorBase):
-    """Python multiprocessing-based distributed executor"""
-
-    uses_ray: bool = False
-
-    def _check_cuda(self) -> None:
-        """Check that the number of GPUs is sufficient for the parallel
-        configuration. Separate from _init_executor to reduce the number of
-        indented blocks.
-        """
-        parallel_config = self.parallel_config
-        world_size = parallel_config.world_size
-        tensor_parallel_size = parallel_config.tensor_parallel_size
-
-        cuda_device_count = cuda_device_count_stateless()
-        # Use confusing message for more common TP-only case.
-        if tensor_parallel_size > cuda_device_count:
-            raise RuntimeError(
-                f"please set tensor_parallel_size ({tensor_parallel_size}) "
-                f"to less than max local gpu count ({cuda_device_count})")
-
-        if world_size > cuda_device_count:
-            raise RuntimeError(
-                f"please ensure that world_size ({world_size}) "
-                f"is less than than max local gpu count ({cuda_device_count})")
-
-        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
-        if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            update_environment_variables({
-                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
-            })
-
-    def _init_executor(self) -> None:
-
-        from vllm.platforms import current_platform
-        if current_platform.is_cuda_alike():
-            self._check_cuda()
-
-        # Create the parallel GPU workers.
-        world_size = self.parallel_config.world_size
-        tensor_parallel_size = self.parallel_config.tensor_parallel_size
-
-        # Set multiprocessing envs that are common to V0 and V1
-        set_multiprocessing_worker_envs(self.parallel_config)
-
-        # Multiprocessing-based executor does not support multi-node setting.
-        # Since it only works for single node, we can use the loopback address
-        # 127.0.0.1 for communication.
-        distributed_init_method = get_distributed_init_method(
-            "127.0.0.1", get_open_port())
-
-        self.workers: List[ProcessWorkerWrapper] = []
-        # This is the list of workers that are rank 0 of each TP group EXCEPT
-        # global rank 0. These are the workers that will broadcast to the
-        # rest of the workers.
-        self.tp_driver_workers: List[ProcessWorkerWrapper] = []
-        # This is the list of workers that are not drivers and not the first
-        # worker in a TP group. These are the workers that will be
-        # broadcasted to.
-        self.non_driver_workers: List[ProcessWorkerWrapper] = []
-
-        if world_size == 1:
-            self.worker_monitor = None
-        else:
-            result_handler = ResultHandler()
-            for rank in range(1, world_size):
-                worker = ProcessWorkerWrapper(result_handler,
-                                              WorkerWrapperBase,
-                                              self.vllm_config, rank)
-                self.workers.append(worker)
-                if rank % tensor_parallel_size == 0:
-                    self.tp_driver_workers.append(worker)
-                else:
-                    self.non_driver_workers.append(worker)
-
-            self.worker_monitor = WorkerMonitor(self.workers, result_handler)
-            result_handler.start()
-            self.worker_monitor.start()
-
-        # Set up signal handlers to shut down the executor cleanly
-        # sometimes gc does not work well
-
-        self.driver_worker = WorkerWrapperBase(self.vllm_config, 0)
-
-        all_kwargs = []
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-        for i in range(world_size):
-            local_rank = i
-            rank = i
-            kwargs = dict(
-                vllm_config=self.vllm_config,
-                local_rank=local_rank,
-                rank=rank,
-                distributed_init_method=distributed_init_method,
-                is_driver_worker=(not self.parallel_config)
-                or (rank % self.parallel_config.tensor_parallel_size == 0),
-            )
-            all_kwargs.append(kwargs)
-        self._run_workers("init_worker", all_kwargs)
-        self._run_workers("init_device")
-        self._run_workers("load_model",
-                          max_concurrent_workers=self.parallel_config.
-                          max_parallel_loading_workers)
-        self.driver_exec_model = make_async(self.driver_worker.execute_model)
-        self.pp_locks: Optional[List[asyncio.Lock]] = None
-
-    def shutdown(self):
-        if (worker_monitor := getattr(self, "worker_monitor",
-                                      None)) is not None:
-            worker_monitor.close()
-
-    def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[List[SamplerOutput]]:
-        """Run execute_model in the driver worker.
-
-        Passing None will cause the driver to stop the model execution
-        loop running in each of the remote workers.
-        """
-        return self.driver_worker.execute_model(execute_model_req)
-
-    def _run_workers(
-        self,
-        method: Union[str, Callable],
-        *args,
-        async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
-        **kwargs,
-    ) -> List[Any]:
-        """Runs the given method on all workers.
-
-        Args:
-            async_run_tensor_parallel_workers_only: If True the method will be
-                run only in the remote TP workers, not the driver worker.
-                It will also be run asynchronously and return a list of futures
-                rather than blocking on the results.
-        """
-        if isinstance(method, str):
-            sent_method = method
-        else:
-            sent_method = cloudpickle.dumps(method)
-        del method
-
-        if max_concurrent_workers:
-            raise NotImplementedError(
-                "max_concurrent_workers is not supported yet.")
-
-        if async_run_tensor_parallel_workers_only:
-            # Run only non-driver workers and just return futures.
-            return [
-                worker.execute_method(sent_method, *args, **kwargs)
-                for worker in self.non_driver_workers
-            ]
-
-        # Start all remote workers first.
-        worker_outputs = [
-            worker.execute_method(sent_method, *args, **kwargs)
-            for worker in self.workers
-        ]
-
-        driver_worker_output = run_method(self.driver_worker, sent_method,
-                                          args, kwargs)
-
-        # Get the results of the workers.
-        return [driver_worker_output
-                ] + [output.get() for output in worker_outputs]
-
-    def check_health(self) -> None:
-        """Raises an error if engine is unhealthy."""
-        if self.worker_monitor is not None and not self.worker_monitor.is_alive(
-        ):
-            raise RuntimeError("Worker processes are not running")
-
-    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
-        """Wait for futures returned from _run_workers() with
-        async_run_remote_workers_only to complete."""
-        for result in parallel_worker_tasks:
-            result.get()
-
-    async def _driver_execute_model_async(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> List[SamplerOutput]:
-        if not self.tp_driver_workers:
-            return await self.driver_exec_model(execute_model_req)
-
-        if self.pp_locks is None:
-            # This locks each pipeline parallel stage so multiple virtual
-            # engines can't execute on the same stage at the same time
-            # We create the locks here to avoid creating them in the constructor
-            # which uses a different asyncio loop.
-            self.pp_locks = [
-                asyncio.Lock()
-                for _ in range(self.parallel_config.pipeline_parallel_size)
-            ]
-
-        tasks = [
-            asyncio.create_task(
-                _run_task_with_lock(self.driver_exec_model, self.pp_locks[0],
-                                    execute_model_req))
-        ]
-        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
-                                                start=1):
-            tasks.append(
-                asyncio.create_task(
-                    _run_task_with_lock(driver_worker.execute_method_async,
-                                        self.pp_locks[pp_rank],
-                                        "execute_model", execute_model_req)))
-        results = await asyncio.gather(*tasks)
-
-        # Only the last PP stage has the final results.
-        return results[-1]
-
-    async def _start_worker_execution_loop(self):
-        coros = [
-            worker.execute_method_async("start_worker_execution_loop")
-            for worker in self.non_driver_workers
-        ]
-        return await asyncio.gather(*coros)
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@ -1,279 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import os
-import threading
-import uuid
-from dataclasses import dataclass
-from multiprocessing import Queue
-from multiprocessing.connection import wait
-from multiprocessing.process import BaseProcess
-from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union
-
-import torch
-
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context,
-                        run_method)
-
-logger = init_logger(__name__)
-
-T = TypeVar('T')
-
-_TERMINATE = "TERMINATE"  # sentinel
-
-JOIN_TIMEOUT_S = 2
-
-
-@dataclass
-class Result(Generic[T]):
-    """Result of task dispatched to worker"""
-
-    task_id: uuid.UUID
-    value: Optional[T] = None
-    exception: Optional[BaseException] = None
-
-
-class ResultFuture(threading.Event, Generic[T]):
-    """Synchronous future for non-async case"""
-
-    def __init__(self):
-        super().__init__()
-        self.result: Optional[Result[T]] = None
-
-    def set_result(self, result: Result[T]):
-        self.result = result
-        self.set()
-
-    def get(self) -> T:
-        self.wait()
-        assert self.result is not None
-        if self.result.exception is not None:
-            raise self.result.exception
-        return self.result.value  # type: ignore[return-value]
-
-
-def _set_future_result(future: Union[ResultFuture, asyncio.Future],
-                       result: Result):
-    if isinstance(future, ResultFuture):
-        future.set_result(result)
-        return
-    loop = future.get_loop()
-    if not loop.is_closed():
-        if result.exception is not None:
-            loop.call_soon_threadsafe(future.set_exception, result.exception)
-        else:
-            loop.call_soon_threadsafe(future.set_result, result.value)
-
-
-class ResultHandler(threading.Thread):
-    """Handle results from all workers (in background thread)"""
-
-    def __init__(self) -> None:
-        super().__init__(daemon=True)
-        self.result_queue = get_mp_context().Queue()
-        self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}
-
-    def run(self):
-        for result in iter(self.result_queue.get, _TERMINATE):
-            future = self.tasks.pop(result.task_id)
-            _set_future_result(future, result)
-        # Ensure that all waiters will receive an exception
-        for task_id, future in self.tasks.items():
-            _set_future_result(
-                future,
-                Result(task_id=task_id,
-                       exception=ChildProcessError("worker died")))
-
-    def close(self):
-        self.result_queue.put(_TERMINATE)
-
-
-class WorkerMonitor(threading.Thread):
-    """Monitor worker status (in background thread)"""
-
-    def __init__(self, workers: List['ProcessWorkerWrapper'],
-                 result_handler: ResultHandler):
-        super().__init__(daemon=True)
-        self.workers = workers
-        self.result_handler = result_handler
-        self._close = False
-
-    def run(self) -> None:
-        # Blocks until any worker exits
-        dead_sentinels = wait([w.process.sentinel for w in self.workers])
-        if not self._close:
-            self._close = True
-
-            # Kill / cleanup all workers
-            for worker in self.workers:
-                process = worker.process
-                if process.sentinel in dead_sentinels:
-                    process.join(JOIN_TIMEOUT_S)
-                if process.exitcode is not None and process.exitcode != 0:
-                    logger.error("Worker %s pid %s died, exit code: %s",
-                                 process.name, process.pid, process.exitcode)
-            # Cleanup any remaining workers
-            if logger:
-                logger.info("Killing local vLLM worker processes")
-            for worker in self.workers:
-                worker.kill_worker()
-            # Must be done after worker task queues are all closed
-            self.result_handler.close()
-
-        for worker in self.workers:
-            worker.process.join(JOIN_TIMEOUT_S)
-
-    def close(self):
-        if self._close:
-            return
-        self._close = True
-        logger.info("Terminating local vLLM worker processes")
-        for worker in self.workers:
-            worker.terminate_worker()
-        # Must be done after worker task queues are all closed
-        self.result_handler.close()
-
-
-class ProcessWorkerWrapper:
-    """Local process wrapper for vllm.worker.Worker,
-    for handling single-node multi-GPU tensor parallel."""
-
-    def __init__(self, result_handler: ResultHandler,
-                 worker_factory: Callable[[VllmConfig, int], Any],
-                 vllm_config: VllmConfig, rank: int) -> None:
-        self.mp = get_mp_context()
-        self._task_queue = self.mp.Queue()
-        self.result_queue = result_handler.result_queue
-        self.tasks = result_handler.tasks
-        self.process: BaseProcess = self.mp.Process(  # type: ignore[attr-defined]
-            target=_run_worker_process,
-            name="VllmWorkerProcess",
-            kwargs=dict(
-                worker_factory=worker_factory,
-                task_queue=self._task_queue,
-                result_queue=self.result_queue,
-                vllm_config=vllm_config,
-                rank=rank,
-            ),
-            daemon=True)
-
-        self.process.start()
-
-    def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future],
-                      method: Union[str, bytes], args, kwargs):
-        task_id = uuid.uuid4()
-        self.tasks[task_id] = future
-        try:
-            self._task_queue.put((task_id, method, args, kwargs))
-        except SystemExit:
-            raise
-        except BaseException as e:
-            del self.tasks[task_id]
-            raise ChildProcessError("worker died") from e
-
-    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
-        future: ResultFuture = ResultFuture()
-        self._enqueue_task(future, method, args, kwargs)
-        return future
-
-    async def execute_method_async(self, method: Union[str, bytes], *args,
-                                   **kwargs):
-        future = asyncio.get_running_loop().create_future()
-        self._enqueue_task(future, method, args, kwargs)
-        return await future
-
-    def terminate_worker(self):
-        try:
-            self._task_queue.put(_TERMINATE)
-        except ValueError:
-            self.process.kill()
-        self._task_queue.close()
-
-    def kill_worker(self):
-        self._task_queue.close()
-        self.process.kill()
-
-
-def _run_worker_process(
-    worker_factory: Callable[[VllmConfig, int], Any],
-    task_queue: Queue,
-    result_queue: Queue,
-    vllm_config: VllmConfig,
-    rank: int,
-) -> None:
-    """Worker process event loop"""
-
-    # Add process-specific prefix to stdout and stderr
-    process_name = get_mp_context().current_process().name
-    decorate_logs(process_name)
-
-    # Initialize worker
-    worker = worker_factory(vllm_config, rank)
-    del worker_factory
-
-    # Accept tasks from the engine in task_queue
-    # and return task output in result_queue
-    logger.info("Worker ready; awaiting tasks")
-    try:
-        for items in iter(task_queue.get, _TERMINATE):
-            output = None
-            exception = None
-            task_id, method, args, kwargs = items
-            try:
-                output = run_method(worker, method, args, kwargs)
-            except SystemExit:
-                raise
-            except KeyboardInterrupt:
-                break
-            except BaseException as e:
-                logger.exception(
-                    "Exception in worker %s while processing method %s.",
-                    process_name, method)
-                exception = e
-            result_queue.put(
-                Result(task_id=task_id, value=output, exception=exception))
-    except KeyboardInterrupt:
-        pass
-    except Exception:
-        logger.exception("Worker failed")
-
-    # Flush TunableOp results when TunableOp is enabled and
-    # online (in situ) tuning is enabled.
-    # Offline tuning API (record_untuned_is_enabled()) only
-    # available in PyTorch 2.6 or later.
-    if torch.cuda.is_available():
-        import torch.cuda.tunable as tunable
-        if (tunable.is_enabled() and tunable.tuning_is_enabled()
-                and not tunable.record_untuned_is_enabled()):
-            tunable.write_file()
-
-    logger.info("Worker exiting")
-
-
-def set_multiprocessing_worker_envs(parallel_config):
-    """ Set up environment variables that should be used when there are workers
-    in a multiprocessing environment. This should be called by the parent 
-    process before worker processes are created"""
-
-    _maybe_force_spawn()
-
-    # Configure thread parallelism if OMP_NUM_THREADS isn't set
-    #
-    # Helps to avoid CPU contention. The default of spawning a thread per
-    # core combined with multiprocessing for each GPU can have a negative
-    # impact on performance. The contention is amplified when running in a
-    # container where CPU limits can cause throttling.
-    default_omp_num_threads = 1
-    if "OMP_NUM_THREADS" not in os.environ and (
-            current_parallelism :=
-            torch.get_num_threads()) > default_omp_num_threads:
-        logger.warning(
-            "Reducing Torch parallelism from %d threads to %d to avoid "
-            "unnecessary CPU contention. Set OMP_NUM_THREADS in the "
-            "external environment to tune this value as needed.",
-            current_parallelism, default_omp_num_threads)
-        os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads)
-        torch.set_num_threads(default_omp_num_threads)
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@ -17,12 +17,12 @@ from vllm.executor.msgspec_utils import encode_hook
 from vllm.executor.ray_utils import (RayWorkerWrapper, initialize_ray_cluster,
                                     ray)
 from vllm.logger import init_logger
-from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
                        get_ip, get_open_port, make_async)
+from vllm.v1.outputs import SamplerOutput

 if ray is not None:
    from ray.actor import ActorHandle
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@ -137,10 +137,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
    def _init_executor(self) -> None:
        """Initialize the worker and load the model.
        """
-        assert self.vllm_config.scheduler_config.delay_factor == 0.0, \
-            ("ExecutorWithExternalLauncher needs deterministic "
-            "execution, so it"
-            "does not support delay_factor in scheduling")
        if envs.VLLM_USE_V1:
            assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
            ("To get deterministic execution in V1, "
--- a/vllm/inputs/init.py
+++ b/vllm/inputs/init.py
@ -7,15 +7,7 @@ from .data import (DataPrompt, DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt,
                   SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt,
                   build_explicit_enc_dec_prompt, embeds_inputs,
                   to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts)
-from .registry import (DummyData, InputContext, InputProcessingContext,
-                       InputRegistry)
-
-INPUT_REGISTRY = InputRegistry()
-"""
-The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
-by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
-target model.
-"""
+from .registry import InputContext, InputProcessingContext

 __all__ = [
    "DataPrompt",
@ -36,9 +28,6 @@ __all__ = [
    "build_explicit_enc_dec_prompt",
    "to_enc_dec_tuple_list",
    "zip_enc_dec_prompts",
-    "INPUT_REGISTRY",
-    "DummyData",
    "InputContext",
    "InputProcessingContext",
-    "InputRegistry",
 ]
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, Union

 import torch
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
@ -15,16 +15,9 @@ from vllm.utils.jsontree import JSONTree, json_map_leaves

 if TYPE_CHECKING:
    from vllm.config import ModelConfig
-    from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
-                                 MultiModalRegistry)
-    from vllm.sequence import SequenceData
    from vllm.transformers_utils.tokenizer import AnyTokenizer
 else:
    ModelConfig = Any
-    MultiModalDataDict = Any
-    MultiModalPlaceholderDict = Any
-    MultiModalRegistry = Any
-    SequenceData = Any
    AnyTokenizer = Any

 _T = TypeVar("_T")
@ -191,61 +184,3 @@ class InputProcessingContext(InputContext):
                   f"on data={data} with kwargs={allowed_kwargs}")

            raise ValueError(msg) from exc
-
-
-class DummyData(NamedTuple):
-    """
-    Dummy data used for profiling.
-
-    Note: This is only used in V0.
-    """
-
-    seq_data: SequenceData
-    multi_modal_data: Optional[MultiModalDataDict] = None
-    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
-
-
-class InputRegistry:
-    """
-    Note: This is only used in V0.
-    """
-
-    def dummy_data_for_profiling(
-        self,
-        model_config: ModelConfig,
-        seq_len: int,
-        mm_registry: MultiModalRegistry,
-        is_encoder_data: bool = False,
-    ) -> DummyData:
-        """
-        Create dummy data for profiling the memory usage of a model.
-
-        The model is identified by ``model_config``.
-        """
-        # Avoid circular import
-        from vllm.multimodal.cache import processor_only_cache_from_config
-        from vllm.sequence import SequenceData
-
-        if not model_config.is_multimodal_model:
-            seq_data = SequenceData.from_prompt_token_counts((0, seq_len))
-            return DummyData(seq_data=seq_data)
-
-        cache = processor_only_cache_from_config(model_config, mm_registry)
-
-        # Encoder dummy data does not contain multi-modal data
-        if is_encoder_data:
-            enc_data = mm_registry.get_encoder_dummy_data(model_config,
-                                                          seq_len,
-                                                          cache=cache)
-            seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids)
-            return DummyData(seq_data=seq_data)
-
-        dec_data = mm_registry.get_decoder_dummy_data(model_config,
-                                                      seq_len,
-                                                      cache=cache)
-
-        return DummyData(
-            seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids),
-            multi_modal_data=dec_data.multi_modal_data.get_data(),
-            multi_modal_placeholders=dec_data.multi_modal_placeholders,
-        )
--- a/vllm/logging_utils/init.py
+++ b/vllm/logging_utils/init.py
@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm.logging_utils.formatter import NewLineFormatter
+from vllm.logging_utils.log_time import logtime

 __all__ = [
    "NewLineFormatter",
+    "logtime",
 ]
--- a/vllm/logging_utils/log_time.py
+++ b/vllm/logging_utils/log_time.py
@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Provides a timeslice logging decorator
+"""
+
+import functools
+import time
+
+
+def logtime(logger, msg=None):
+    """
+    Build a decorator that logs the execution time of the decorated function.
+
+    The elapsed time is measured with ``time.perf_counter`` and emitted via
+    ``logger.debug``. It is logged even when the decorated function raises,
+    so failing calls are timed as well; the exception propagates unchanged.
+    Always place it beneath other decorators.
+
+    Args:
+        logger: Object exposing a ``debug(fmt, *args)`` method.
+        msg: Optional prefix for the log line. When ``None``, the prefix
+            names the decorated function by module and qualified name.
+
+    Returns:
+        A decorator that wraps a callable and returns its result unchanged.
+    """
+
+    def _inner(func):
+        # The prefix depends only on ``func`` and ``msg``, so build it once
+        # at decoration time instead of on every call.
+        prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
+            if msg is None else msg
+
+        @functools.wraps(func)
+        def _wrapper(*args, **kwargs):
+            start = time.perf_counter()
+            try:
+                return func(*args, **kwargs)
+            finally:
+                # Runs on both normal return and exception, so the
+                # duration of failing calls is reported too.
+                elapsed = time.perf_counter() - start
+                logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
+
+        return _wrapper
+
+    return _inner
--- a/vllm/model_executor/init.py
+++ b/vllm/model_executor/init.py
@ -3,13 +3,9 @@

 from vllm.model_executor.parameter import (BasevLLMParameter,
                                           PackedvLLMParameter)
-from vllm.model_executor.sampling_metadata import (SamplingMetadata,
-                                                   SamplingMetadataCache)
 from vllm.model_executor.utils import set_random_seed

 __all__ = [
-    "SamplingMetadata",
-    "SamplingMetadataCache",
    "set_random_seed",
    "BasevLLMParameter",
    "PackedvLLMParameter",
--- a/Show More
+++ b/Show More