[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-03-16 15:47:22 +08:00 · 2025-10-07 23:42:31 +08:00 · 2025-10-07 23:42:31 +08:00 · 1e4ecca1d0
commit 1e4ecca1d0
parent c0a7b89d8e
51 changed files with 817 additions and 1275 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -296,6 +296,7 @@ steps:
    - tests/v1
  commands:
    # split the test to avoid interference
+    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
@ -317,7 +318,7 @@ steps:
  no_gpu: true
  commands:
    # split the test to avoid interference
-    - pytest -v -s v1/core
+    - pytest -v -s -m 'cpu_test' v1/core
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/test_serial_utils.py
    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@ -13,7 +13,7 @@ import pytest
 import torch

 from vllm import LLM
-from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
+from vllm.v1.engine.llm_engine import LLMEngine

 from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
@ -211,16 +211,11 @@ def test_models_distributed(


 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
-    from vllm.envs import VLLM_USE_V1
-
-    if not VLLM_USE_V1:
-        pytest.skip("Skipping V0 test, dump input not supported")
-
    # Needed to mock an error in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
+        if isinstance(vllm_model.llm.llm_engine, LLMEngine):
            v1_test_failed_model_execution(vllm_model)


--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@ -117,68 +117,59 @@ def test_cumem_with_cudagraph():

@create_new_process_for_each_test()
@pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
    [
        # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "meta-llama/Llama-3.2-1B",
        # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
+        "facebook/opt-125m",
    ],
 )
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
-        free, total = torch.cuda.mem_get_info()
-        used_bytes_baseline = total - free  # in case other process is running
-        llm = LLM(model, enable_sleep_mode=True)
-        prompt = "How are you?"
-        sampling_params = SamplingParams(temperature=0, max_tokens=10)
-        output = llm.generate(prompt, sampling_params)
+def test_end_to_end(model: str):
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)

-        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-        # which is difficult to measure in the test. therefore, we only
-        # test sleep level 1 here.
-        llm.sleep(level=1)
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)

-        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-        # now the memory usage is mostly cudagraph memory pool,
-        # and it should be less than the model weights (1B model, 2GiB weights)
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)

-        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-        # is captured but cannot be releasesd from PyTorch due to a known bug,
-        # therefore high memory usage after `llm.sleep` is called is expected.
-        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-        # in V1.
-        if use_v1:
-            assert used_bytes < 7 * GiB_bytes
-        else:
-            assert used_bytes < 2 * GiB_bytes
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be released from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    assert used_bytes < 7 * GiB_bytes

-        llm.wake_up()
-        output2 = llm.generate(prompt, sampling_params)
-        # cmp output
-        assert output[0].outputs[0].text == output2[0].outputs[0].text
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text

-        llm.sleep(level=1)
-        llm.wake_up(tags=["weights"])
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])

-        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline

-        # should just reallocate memory for weights (1B model, ~2GiB weights)
-        if use_v1:
-            assert used_bytes < 10 * GiB_bytes
-        else:
-            assert used_bytes < 6 * GiB_bytes
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    assert used_bytes < 10 * GiB_bytes

-        # now allocate kv cache memory
-        llm.wake_up(tags=["kv_cache"])
-        output3 = llm.generate(prompt, sampling_params)
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)

-        # cmp output
-        assert output[0].outputs[0].text == output3[0].outputs[0].text
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text


@create_new_process_for_each_test()
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@ -66,7 +66,6 @@ def llm_pair(request):
            pytest.skip("Only Blackwell GPUs support Cutlass MLA")

    env_vars = {
-        "VLLM_USE_V1": "1",
        # Force native sampler to avoid potential nondeterminism in FlashInfer
        # when per-request generators are not used in V1.
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
    with (
        temporary_environ(
            {
-                "VLLM_USE_V1": "1",
                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
                # Flex_Attention is not supported with full cuda graph
            }
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@ -18,7 +18,6 @@ from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
 )
-from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import is_torch_equal_or_newer

@ -127,7 +126,6 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
    _run_simple_model(
        splitting_ops=["silly.attention"],
        use_inductor_graph_partition=False,
@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops):
-    assert VLLM_USE_V1
    if not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
        "pass_config": {"enable_async_tp": async_tp_enabled},
    }

-    async_tp_env = tp_env = {
-        "VLLM_USE_V1": "1",
-    }
-
    async_tp_args = [
        *common_args,
        "--tensor-parallel-size",
@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
        "mp",
    ]

-    compare_two_settings(
-        model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
-    )
+    compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

-import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.utils import _is_torch_equal_or_newer
@ -16,15 +15,10 @@ def test_version():
    assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")


-def test_use_cudagraphs_dynamic(monkeypatch):
-    assert vllm.envs.VLLM_USE_V1
+def test_use_cudagraphs_dynamic():
    vllm_config = VllmConfig()
    assert vllm_config.compilation_config.use_cudagraph

-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    vllm_config = VllmConfig()
-    assert not vllm_config.compilation_config.use_cudagraph
-

 def test_custom_op():
    # proper syntax
@ -41,8 +35,6 @@ def test_custom_op():
 # may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
 def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
-    assert vllm.envs.VLLM_USE_V1
-
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
@pytest.mark.forked
@pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
-    assert vllm.envs.VLLM_USE_V1
-
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@ -303,7 +303,6 @@ def test_attention_quant_pattern(
    model_class: type[AttentionQuantPatternModel],
    backend: _Backend,
    use_inductor_graph_partition: bool,
-    monkeypatch,
    dist_init,
    caplog_vllm,
 ):
@ -312,8 +311,6 @@ def test_attention_quant_pattern(
    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    device = torch.device("cuda:0")
    torch.manual_seed(42)

--- a/tests/config/test_mp_reducer.py
+++ b/tests/config/test_mp_reducer.py
@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.v1.engine.async_llm import AsyncLLM


-def test_mp_reducer(monkeypatch):
+def test_mp_reducer():
    """
    Test that _reduce_config reducer is registered when AsyncLLM is instantiated
    without transformers_modules. This is a regression test for
    https://github.com/vllm-project/vllm/pull/18640.
    """

-    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    # Ensure transformers_modules is not in sys.modules
    if "transformers_modules" in sys.modules:
        del sys.modules["transformers_modules"]
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@ -5,7 +5,7 @@ from typing import Any, Optional

 import pytest

-from vllm import LLM, SamplingParams, envs
+from vllm import LLM, SamplingParams

 MODEL = "meta-llama/llama-2-7b-hf"
 MAX_TOKENS = 200
@ -111,9 +111,7 @@ def _stop_token_id(llm):

@pytest.mark.skip_global_cleanup
 def test_stop_strings():
-    # If V0, must set enforce_eager=False since we use
-    # async output processing below.
-    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+    llm = LLM(MODEL, enforce_eager=True)

    _stop_basic(llm)
    _stop_multi_tokens(llm)
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
@dataclass
 class CPTestSettings:
    parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
    distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: CPTestOptions

-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})"
-            )
-
    @staticmethod
    def detailed(
        *,
@ -87,7 +73,6 @@ class CPTestSettings:
        return CPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp"],
-            vllm_major_versions=["1"],
            runner=runner,
            test_options=CPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@ -98,14 +83,11 @@ class CPTestSettings:
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(
-                self.distributed_backends, self.vllm_major_versions
-            ):
+            for backend in self.distributed_backends:
                yield (
                    model_id,
                    parallel_setup,
                    backend,
-                    vllm_major_version,
                    self.runner,
                    opts,
                )
@ -115,7 +97,6 @@ def _compare_cp_with_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: CPTestOptions,
    num_gpus_available: int,
@ -191,10 +172,6 @@ def _compare_cp_with_tp(
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

-    cp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,  # Note(hc): DCP only support V1 engine only
-    }
-
    cp_args = [
        *common_args,
        "--tensor-parallel-size",
@ -217,24 +194,13 @@ def _compare_cp_with_tp(
        distributed_backend,
    ]

-    try:
-        compare_two_settings(
-            model_id,
-            cp_args,
-            tp_args,
-            cp_env,
-            tp_env,
-            method=method,
-            max_wait_seconds=720,
-        )
-    except Exception:
-        testing_ray_compiled_graph = cp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(
+        model_id,
+        cp_args,
+        tp_args,
+        method=method,
+        max_wait_seconds=720,
+    )


 CP_TEXT_GENERATION_MODELS = {
@ -257,7 +223,6 @@ CP_TEST_MODELS = [
        "model_id",
        "parallel_setup",
        "distributed_backend",
-        "vllm_major_version",
        "runner",
        "test_options",
    ),
@ -274,7 +239,6 @@ def test_cp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: CPTestOptions,
    num_gpus_available,
@ -283,7 +247,6 @@ def test_cp_generation(
        model_id,
        parallel_setup,
        distributed_backend,
-        vllm_major_version,
        runner,
        test_options,
        num_gpus_available,
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@ -307,7 +307,6 @@ def _compare_tp(
    if distributed_backend == "ray":
        # For V1, test Ray Compiled Graph for all the tests
        pp_env = {
-            "VLLM_USE_V1": "1",
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@ -316,15 +315,11 @@ def _compare_tp(
        # terminate because of a Ray Compiled Graph issue.
        common_args.append("--disable-frontend-multiprocessing")
    elif distributed_backend == "mp":
-        pp_env = {
-            "VLLM_USE_V1": "1",
-        }
+        pp_env = None
    else:
        pp_env = None

-    tp_env = {
-        "VLLM_USE_V1": "1",
-    }
+    tp_env = None

    pp_args = [
        *common_args,
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
@dataclass
 class SPTestSettings:
    parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
    distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: SPTestOptions

-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})"
-            )
-
    @staticmethod
    def detailed(
        *,
@ -85,7 +71,6 @@ class SPTestSettings:
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@ -117,7 +102,6 @@ class SPTestSettings:
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@ -147,7 +131,6 @@ class SPTestSettings:
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
            runner=runner,
            test_options=SPTestOptions(
                multi_node_only=multi_node_only, load_format=load_format
@ -158,14 +141,11 @@ class SPTestSettings:
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(
-                self.distributed_backends, self.vllm_major_versions
-            ):
+            for backend in self.distributed_backends:
                yield (
                    model_id,
                    parallel_setup,
                    backend,
-                    vllm_major_version,
                    self.runner,
                    opts,
                )
@ -175,7 +155,6 @@ def _compare_sp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available: int,
@ -265,10 +244,6 @@ def _compare_sp(
        },
    }

-    tp_sp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
-
    tp_sp_args = [
        *common_args,
        "--tensor-parallel-size",
@ -281,9 +256,6 @@ def _compare_sp(
        json.dumps(compilation_config),
    ]

-    tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
    tp_args = [
        *common_args,
        "--tensor-parallel-size",
@ -292,18 +264,7 @@ def _compare_sp(
        "mp",
    ]

-    try:
-        compare_two_settings(
-            model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
-        )
-    except Exception:
-        testing_ray_compiled_graph = tp_sp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(model_id, tp_sp_args, tp_args, method=method)


 SP_TEXT_GENERATION_MODELS = {
@ -325,7 +286,6 @@ SP_TEST_MODELS = [
        "model_id",
        "parallel_setup",
        "distributed_backend",
-        "vllm_major_version",
        "runner",
        "test_options",
    ),
@ -341,7 +301,6 @@ def test_tp_sp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    vllm_major_version: str,
    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available,
@ -350,7 +309,6 @@ def test_tp_sp_generation(
        model_id,
        parallel_setup,
        distributed_backend,
-        vllm_major_version,
        runner,
        test_options,
        num_gpus_available,
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@ -61,50 +61,34 @@ def run_test(model_name, more_args=None):
 TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"


-@pytest.mark.skipif(
-    not current_platform.is_cuda() and not current_platform.is_tpu(),
-    reason="V1 is currently only supported on CUDA and TPU",
-)
@pytest.mark.parametrize("model", MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine(model):
    """Run with the V1 Engine."""

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    more_args = None
+    if current_platform.is_tpu():
+        # Limit compilation time for TPU V1

-        more_args = None
-        if current_platform.is_tpu():
-            # Limit compilation time for TPU V1
+        more_args = "max_model_len=2048,max_num_seqs=64"

-            more_args = "max_model_len=2048,max_num_seqs=64"
+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)

-            # Add TP test (if provided)
-            if TPU_TP_TEST_STR:
-                more_args += ",{}".format(TPU_TP_TEST_STR)
-
-        run_test(model, more_args)
+    run_test(model, more_args)


-@pytest.mark.skipif(
-    not current_platform.is_cuda() and not current_platform.is_tpu(),
-    reason="V1 is currently only supported on CUDA and TPU",
-)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-    model, monkeypatch: pytest.MonkeyPatch
-):
+def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
    """Run with the V1 Engine."""

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    more_args = None
+    if current_platform.is_tpu():
+        # Limit compilation time for TPU V1
+        more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"

-        more_args = None
-        if current_platform.is_tpu():
-            # Limit compilation time for TPU V1
-            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
+        # Add TP test (if provided)
+        if TPU_TP_TEST_STR:
+            more_args += ",{}".format(TPU_TP_TEST_STR)

-            # Add TP test (if provided)
-            if TPU_TP_TEST_STR:
-                more_args += ",{}".format(TPU_TP_TEST_STR)
-
-        run_test(model, more_args)
+    run_test(model, more_args)
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
 """

 import lm_eval
-import pytest

 from vllm.platforms import current_platform

@ -67,21 +66,13 @@ def run_test(more_args):
        ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"


-@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    and not current_platform.is_tpu()
-    and not current_platform.is_xpu(),
-    reason="V1 currently only supported on CUDA, XPU and TPU",
-)
-def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine():
    """Run with the V1 Engine."""

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        more_args = []
+    more_args = []

-        # Limit compilation time for V1
-        if current_platform.is_tpu():
-            more_args = ["--max-num-seqs", "64"]
+    # Limit compilation time for V1
+    if current_platform.is_tpu():
+        more_args = ["--max-num-seqs", "64"]

-        run_test(more_args)
+    run_test(more_args)
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module")
-def server(monkeypatch_module, zephyr_lora_files):  # noqa: F811
-    monkeypatch_module.setenv("VLLM_USE_V1", "1")
-
+def server(zephyr_lora_files):  # noqa: F811
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@ -37,21 +37,8 @@ BADREQUEST_CASES = [
 ]


-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
@pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files):
-    use_v1 = request.param
-    assert use_v1
-    monkeypatch_module.setenv("VLLM_USE_V1", "1")
-
+def server_with_lora_modules_json(request, zephyr_lora_files):
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "zephyr-lora",
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()


-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    # def test_metrics_exist(use_v1, server, client):
-    #     ...
-    #     expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #     for metric in expected:
-    #         assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
-    yield request.param
-
-
@pytest.fixture(scope="module")
 def default_server_args():
    return [
@ -63,13 +45,11 @@ def default_server_args():
        f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
    ],
 )
-def server(use_v1, default_server_args, request):
+def server(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@ -129,7 +109,8 @@ EXPECTED_VALUES = {

@pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
    for _ in range(_NUM_REQUESTS):
        # sending a request triggers the metrics to be logged.
@ -145,7 +126,7 @@ async def test_metrics_counts(

    # Loop over all expected metric_families
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+        if (metric_family not in EXPECTED_METRICS_V1) or (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        ):
@ -183,62 +164,6 @@ async def test_metrics_counts(
        assert found_metric, f"Did not find {metric_family} in prom endpoint"


-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [

@pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
    # sending a request triggers the metrics to be logged.
    await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
    )

    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    for metric in EXPECTED_METRICS_V1:
        if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
            continue
        assert metric in response.text
@ -322,10 +251,11 @@ async def test_metrics_exist(

@pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
    )

    # Expect no running requests or kvcache usage
@ -351,7 +281,7 @@ async def test_abort_metrics_reset(

    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
    )

    # Expect running requests and kvcache usage
@ -371,7 +301,7 @@ async def test_abort_metrics_reset(

    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
    )

    assert running_requests_after == 0, (
@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
    )


-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)"""

    response = requests.get(server.url_for("metrics"))
@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
    # Verify running and waiting requests counts and KV cache usage are zero
    running_requests, waiting_requests, kv_cache_usage = None, None, None

-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"

    for family in text_string_to_metric_families(response.text):
        if family.name == "vllm:num_requests_running":
@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
    return running_requests, waiting_requests, kv_cache_usage


-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"
@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
                "--port",
                port,
            ],
-            env={"VLLM_USE_V1": "1"},
        )

        def is_server_up(url):
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
 from ...utils import RemoteOpenAIServer


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
@pytest.mark.asyncio
 async def test_empty_prompt():
    model_name = "gpt2"
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@ -80,7 +80,6 @@ def test_env(
 ):
    """Test attention backend selection with valid device-backend pairs."""
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, name)
        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")

@ -212,30 +211,21 @@ def test_env(


@pytest.mark.parametrize("device", ["cpu", "cuda"])
-def test_fp32_fallback(
-    device: str,
-    monkeypatch: pytest.MonkeyPatch,
-):
+def test_fp32_fallback(device: str):
    """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    if device == "cpu":
+        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "TORCH_SDPA"

-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "TORCH_SDPA"
-
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform", CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "FLEX_ATTENTION"
+    elif device == "cuda":
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "FLEX_ATTENTION"


 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
    """Test FlashAttn validation."""
-    # TODO: When testing for v1, pipe in `use_v1` as an argument to
-    # get_attn_backend
-
    pytest.skip(
        "Skipping as current backend selector does not "
        "handle fallbacks when a backend is set via env var."
@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
        monkeypatch.context() as m,
        patch("vllm.attention.selector.current_platform", CudaPlatform()),
    ):
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

        # Should raise ValueError for invalid backend
--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):

    # Run with flex attention
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")

        set_seed(seed)
@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):

    # Run with default backend
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        set_seed(seed)
        with vllm_runner(
            model_name,
@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):

    # Run with flex attention
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
        with vllm_runner(
            model_name,
@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
            flex_outputs = llm_flex.embed(prompts)

    # Run with default backend
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        with vllm_runner(
+    with (
+        monkeypatch.context() as m,
+        vllm_runner(
            model_name,
            runner="pooling",
            dtype=torch.bfloat16,
            tensor_parallel_size=1,
            max_model_len=100,
            enforce_eager=True,
-        ) as llm_default:
-            default_outputs = llm_default.embed(prompts)
+        ) as llm_default,
+    ):
+        default_outputs = llm_default.embed(prompts)

    check_embeddings_close(
        embeddings_0_lst=flex_outputs,
--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
@ -613,7 +613,6 @@ def test_dummy_maverick(
    profile: bool = False,
 ) -> None:
    # Disable multiprocessing allows us to access model executor from LLM engine
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    model_path = create_reduced_maverick_model(
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
@ -8,7 +8,6 @@ if TYPE_CHECKING:
    from vllm.config import VllmConfig
 else:
    VllmConfig = None
-from vllm import envs


 class DummyPlatform(Platform):
@ -19,10 +18,7 @@ class DummyPlatform(Platform):

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if envs.VLLM_USE_V1:
-            compilation_config = vllm_config.compilation_config
-            # Activate custom ops for v1.
-            compilation_config.custom_ops = ["all"]
+        vllm_config.compilation_config.custom_ops = ["all"]

    def get_attn_backend_cls(
        self,
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):

 def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        # Explicitly turn off engine multiprocessing so
        # that the scheduler runs in this process
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.

 from typing import Optional

-import pytest
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams


-@pytest.fixture(autouse=True)
-def v1(monkeypatch):
-    """Only run on vLLM v1."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
 def _generate(
    llm: LLM,
    prompt: str,
--- a/tests/tpu/lora/test_lora.py
+++ b/tests/tpu/lora/test_lora.py
@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
 # 100 training iterations with a training batch size of 100.


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
-    for all tests in this file
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        yield
-
-
 def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
    return vllm.LLM(
        model="Qwen/Qwen2.5-3B-Instruct",
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@ -305,7 +305,6 @@ full_cg_backend_configs = {
    "CutlassMLA": BackendConfig(
        name="CutlassMLA",
        env_vars={
-            "VLLM_USE_V1": "1",
            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
            "FORCE_NUM_KV_SPLITS": "1",  # TODO: remove this when hang issue is fixed
        },
--- a/tests/v1/core/test_kv_sharing.py
+++ b/tests/v1/core/test_kv_sharing.py
@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import pytest
 import torch

 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
 from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups

+pytestmark = pytest.mark.cpu_test
+

 def new_kv_cache_spec():
    return FullAttentionSpec(16, 1, 1, torch.float32, False)
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest

 from vllm import LLM

-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
 MODEL = "meta-llama/Llama-3.2-1B"
 PROMPT = "Hello my name is Robert and I"

--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
    ):
        pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")

-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars

    with temporary_environ(env_vars), ExitStack() as stack:
        if not supported:
@ -117,7 +117,7 @@ combo_cases_2 = [
 def test_cudagraph_compilation_combo(combo_case):
    backend_name, cudagraph_mode, compilation_level, supported = combo_case

-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars

    with temporary_environ(env_vars), ExitStack() as stack:
        if not supported:
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
        )

    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
 def test_sliding_window_retrieval(
-    monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
+    model, batch_size, seed, disable_hybrid_kv_cache_manager
 ):
    """
    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
    If we tell it upfront which we are going to be looking for, then
    it answers correctly (mostly).
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    test_config = model_config[model]

-        test_config = model_config[model]
+    llm = LLM(
+        model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
+    )
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)

-        llm = LLM(
-            model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
-        )
-        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+    prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)

-        prompts, answer, indices = prep_prompts(
-            batch_size, ln_range=test_config.ln_range
-        )
+    check_length(prompts, llm, test_config.sliding_window)

-        check_length(prompts, llm, test_config.sliding_window)
+    # Fresh generation
+    responses = llm.generate(prompts, sampling_params)
+    check_answers(
+        indices,
+        answer,
+        [response.outputs[0].text for response in responses],
+        accept_rate=1.0,
+    )

-        # Fresh generation
-        responses = llm.generate(prompts, sampling_params)
-        check_answers(
-            indices,
-            answer,
-            [response.outputs[0].text for response in responses],
-            accept_rate=1.0,
-        )
-
-        # Re-generate with the same prompts to test prefix caching
-        responses = llm.generate(prompts, sampling_params)
-        check_answers(
-            indices,
-            answer,
-            [response.outputs[0].text for response in responses],
-            accept_rate=1.0,
-        )
+    # Re-generate with the same prompts to test prefix caching
+    responses = llm.generate(prompts, sampling_params)
+    check_answers(
+        indices,
+        answer,
+        [response.outputs[0].text for response in responses],
+        accept_rate=1.0,
+    )


 def check_length(prompts: list[str], llm: LLM, sliding_window: int):
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
    )

    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Make scheduling deterministic for reproducibility
        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

--- a/tests/v1/e2e/test_min_tokens.py
+++ b/tests/v1/e2e/test_min_tokens.py
@ -13,7 +13,6 @@ Covers:
 5) Multiple stop conditions
 """

-import os
 from typing import Optional, Union

 import pytest
@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module")
 def llm_v1():
    """Create V1 LLM instance for testing"""
-    # Ensure V1 engine is used
-    os.environ["VLLM_USE_V1"] = "1"
-
    llm = LLM(
        model=TEST_MODEL,
        tensor_parallel_size=1,
@ -503,6 +499,6 @@ if __name__ == "__main__":
    
    Usage:
        cd vllm/
-        VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
+        python -m pytest tests/v1/e2e/test_min_tokens.py -v
    """
    pytest.main([__file__, "-v"])
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@ -301,7 +301,6 @@ def test_mtp_correctness(
    model_setup: (method, model_name, tp_size)
    """
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_MLA_DISABLE", "1")

        method, model_name, tp_size = model_setup
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@ -95,17 +95,11 @@ async def generate(
 )
@pytest.mark.asyncio
 async def test_load(
-    monkeypatch: pytest.MonkeyPatch,
    output_kind: RequestOutputKind,
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
-    # so that in the future when we switch, we don't have to change all the
-    # tests.
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@ -149,14 +143,11 @@ async def test_load(
 )
@pytest.mark.asyncio
 async def test_abort(
-    monkeypatch: pytest.MonkeyPatch,
    output_kind: RequestOutputKind,
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@ -222,13 +213,8 @@ async def test_abort(
    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
 )
@pytest.mark.asyncio
-async def test_multi_abort(
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+async def test_multi_abort(output_kind: RequestOutputKind):
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
@ -304,14 +290,11 @@ async def test_multi_abort(
 )
@pytest.mark.asyncio
 async def test_finished_flag(
-    monkeypatch: pytest.MonkeyPatch,
    n: int,
    engine_args: AsyncEngineArgs,
    prompt: PromptType,
 ):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@ -341,12 +324,10 @@ async def test_finished_flag(
 )
@pytest.mark.asyncio
 async def test_mid_stream_cancellation(
-    monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
+    engine_args: AsyncEngineArgs, prompt: PromptType
 ):
    """Test that requests can be cancelled mid-stream."""
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(engine_args)
        after.callback(engine.shutdown)
@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
    be added to the default loggers.
    """

-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(
                TEXT_ENGINE_ARGS,
@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):


@pytest.mark.asyncio(scope="module")
-async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+async def test_dp_rank_argument():
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):


@pytest.mark.asyncio
-async def test_check_health(monkeypatch: pytest.MonkeyPatch):
+async def test_check_health():
    """Test that check_health returns normally for healthy engine
    and raises EngineDeadError when the engine is dead.
    """
@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):

    from vllm.v1.engine.exceptions import EngineDeadError

-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
 )
@pytest.mark.asyncio
-async def test_abort_final_output(
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
+async def test_abort_final_output(output_kind: RequestOutputKind):
    """Test that abort() returns a final output with correct information."""

-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
        with set_default_torch_num_threads(1):
            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
        after.callback(engine.shutdown)
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@ -5,18 +5,11 @@ from argparse import ArgumentError

 import pytest

-from vllm import envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import EngineArgs
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser

-if not envs.VLLM_USE_V1:
-    pytest.skip(
-        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
-        allow_module_level=True,
-    )
-

 def test_prefix_caching_from_cli():
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:


@create_new_process_for_each_test()
-def test_engine_core(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
+def test_engine_core():
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)

-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
-        """Test basic request lifecycle."""
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )
+    """Test basic request lifecycle."""

-        # First request.
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
+    # First request.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 1
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 1

-        # Second request.
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 1
+    # Second request.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 1

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2

-        # Add two requests in a row.
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
-        assert len(engine_core.scheduler.waiting) == 2
-        assert len(engine_core.scheduler.running) == 2
+    # Add two requests in a row.
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
+    assert len(engine_core.scheduler.waiting) == 2
+    assert len(engine_core.scheduler.running) == 2

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 4
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 4

-        # Loop through until they are all done.
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
+    # Loop through until they are all done.
+    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+        pass

-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        """Test abort cycle."""
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    """Test abort cycle."""

-        # Basic abort.
-        req = make_request()
-        request_id = req.request_id
+    # Basic abort.
+    req = make_request()
+    request_id = req.request_id

-        engine_core.add_request(*engine_core.preprocess_add_request(req))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
-        assert engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
+    engine_core.add_request(*engine_core.preprocess_add_request(req))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0
+    assert engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 1
-        assert engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 1
+    assert engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()

-        engine_core.abort_requests([request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
-        assert not engine_core.scheduler.has_unfinished_requests()
-        assert engine_core.scheduler.has_finished_requests()
+    engine_core.abort_requests([request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0
+    assert not engine_core.scheduler.has_unfinished_requests()
+    assert engine_core.scheduler.has_finished_requests()

-        _ = engine_core.step()
-        assert not engine_core.scheduler.has_unfinished_requests()
-        assert not engine_core.scheduler.has_finished_requests()
+    _ = engine_core.step()
+    assert not engine_core.scheduler.has_unfinished_requests()
+    assert not engine_core.scheduler.has_finished_requests()

-        # Add, step, abort 1 of the 3.
-        req0 = make_request()
-        req1 = make_request()
-        req2 = make_request()
+    # Add, step, abort 1 of the 3.
+    req0 = make_request()
+    req1 = make_request()
+    req2 = make_request()

-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-        assert len(engine_core.scheduler.waiting) == 2
-        assert len(engine_core.scheduler.running) == 0
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    assert len(engine_core.scheduler.waiting) == 2
+    assert len(engine_core.scheduler.running) == 0

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2

-        engine_core.add_request(*engine_core.preprocess_add_request(req2))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 2
+    engine_core.add_request(*engine_core.preprocess_add_request(req2))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 2

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 3
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 3

-        # Abort just one.
-        engine_core.abort_requests([req1.request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
+    # Abort just one.
+    engine_core.abort_requests([req1.request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2

-        _ = engine_core.step()
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 2
+    _ = engine_core.step()
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 2

-        # Abort the other requests at the same time.
-        engine_core.abort_requests([req2.request_id, req0.request_id])
-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
+    # Abort the other requests at the same time.
+    engine_core.abort_requests([req2.request_id, req0.request_id])
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0

-        # Sending duplicate requests with same request_id
-        req0 = make_request()
-        req1 = make_request()
-        req0.request_id = req1.request_id = "test"
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    # Sending duplicate requests with same request_id
+    req0 = make_request()
+    req1 = make_request()
+    req0.request_id = req1.request_id = "test"
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))

-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
+    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+        pass

-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-            pass
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+        pass

-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
+    assert len(engine_core.scheduler.waiting) == 0
+    assert len(engine_core.scheduler.running) == 0


@create_new_process_for_each_test()
-def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_advanced_sampling():
    """
    A basic end-to-end test to verify that the engine functions correctly
    when additional sampling parameters, such as top_p, min_tokens, and
    presence_penalty, are set.
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)

-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
-        """Test basic request lifecycle."""
-        # First request.
-        request: EngineCoreRequest = make_request()
-        request.sampling_params = SamplingParams(
-            min_tokens=4,
-            presence_penalty=1.0,
-            frequency_penalty=1.0,
-            repetition_penalty=0.1,
-            stop_token_ids=[1001, 1002],
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
        )
-        engine_core.add_request(*engine_core.preprocess_add_request(request))
+    """Test basic request lifecycle."""
+    # First request.
+    request: EngineCoreRequest = make_request()
+    request.sampling_params = SamplingParams(
+        min_tokens=4,
+        presence_penalty=1.0,
+        frequency_penalty=1.0,
+        repetition_penalty=0.1,
+        stop_token_ids=[1001, 1002],
+    )
+    engine_core.add_request(*engine_core.preprocess_add_request(request))

-        def _check_engine_state():
-            assert len(engine_core.scheduler.waiting) == 1
-            assert len(engine_core.scheduler.running) == 0
-            # Loop through until they are all done.
-            while (outs := engine_core.step()[0].get(0)) and outs.outputs:
-                pass
-            assert len(engine_core.scheduler.waiting) == 0
-            assert len(engine_core.scheduler.running) == 0
+    def _check_engine_state():
+        assert len(engine_core.scheduler.waiting) == 1
+        assert len(engine_core.scheduler.running) == 0
+        # Loop through until they are all done.
+        while (outs := engine_core.step()[0].get(0)) and outs.outputs:
+            pass
+        assert len(engine_core.scheduler.waiting) == 0
+        assert len(engine_core.scheduler.running) == 0

-        _check_engine_state()
+    _check_engine_state()

-        # Second request.
-        request2 = make_request()
-        request2.sampling_params = SamplingParams(
-            top_p=0.99,
-            top_k=50,
-        )
-        engine_core.add_request(*engine_core.preprocess_add_request(request2))
-        _check_engine_state()
+    # Second request.
+    request2 = make_request()
+    request2.sampling_params = SamplingParams(
+        top_p=0.99,
+        top_k=50,
+    )
+    engine_core.add_request(*engine_core.preprocess_add_request(request2))
+    _check_engine_state()


@create_new_process_for_each_test()
-def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_concurrent_batches():
    """
    Test that the engine can handle multiple concurrent batches.
    """
@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
            if hasattr(self, "thread_pool"):
                self.thread_pool.shutdown(wait=False)

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            # To test concurrent batches.
-            max_num_seqs=2,
-            # Avoid all requests being scheduled once.
-            enable_prefix_caching=False,
-            max_num_batched_tokens=10,
-            # Reduce startup time.
-            enforce_eager=True,
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        # To test concurrent batches.
+        max_num_seqs=2,
+        # Avoid all requests being scheduled once.
+        enable_prefix_caching=False,
+        max_num_batched_tokens=10,
+        # Reduce startup time.
+        enforce_eager=True,
+    )
+    vllm_config = engine_args.create_engine_config()
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
        )
-        vllm_config = engine_args.create_engine_config()
-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
+    assert engine_core.batch_queue is not None
+
+    # Add two requests in a row. Each request has 12 prompt tokens.
+    req0 = make_request_with_max_tokens("0", 5)
+    engine_core.add_request(*engine_core.preprocess_add_request(req0))
+    req1 = make_request_with_max_tokens("1", 5)
+    engine_core.add_request(*engine_core.preprocess_add_request(req1))
+
+    # Schedule Batch 1: (10, req0)
+    assert engine_core.step_with_batch_queue()[0] is None
+    assert len(engine_core.batch_queue) == 1
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 10
+    # num_computed_tokens should have been updated immediately.
+    assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
+
+    # Schedule Batch 2: (2, req0), (8, req1)
+    assert engine_core.step_with_batch_queue()[0] == {}
+    assert len(engine_core.batch_queue) == 1
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 2
+    assert scheduler_output.num_scheduled_tokens["1"] == 8
+    # num_computed_tokens should have been updated immediately.
+    assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
+    assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
+
+    assert engine_core.scheduler.get_num_unfinished_requests() == 2
+
+    # Finish Batch 1 and schedule Batch 3: (4, req1).
+    # Note that req0 cannot be scheduled
+    # because it is in the decoding stage now.
+    engine_core.step_with_batch_queue()
+    assert len(engine_core.batch_queue) == 1
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["1"] == 4
+
+    # Finish Batch 2. Get first token of req0.
+    # Schedule Batch 4: (1, req0).
+    output = engine_core.step_with_batch_queue()[0].get(0)
+    assert output is not None
+    assert len(output.outputs) == 1
+    assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["0"] == 1
+
+    # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
+    output = engine_core.step_with_batch_queue()[0].get(0)
+    assert output is not None
+    assert len(output.outputs) == 1
+    assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
+    scheduler_output = engine_core.batch_queue[-1][1]
+    assert scheduler_output.num_scheduled_tokens["1"] == 1
+
+    # Loop until req0 is finished.
+    req_id = 0
+    expected_num_tokens = [
+        engine_core.scheduler.requests["0"].num_tokens + 1,
+        engine_core.scheduler.requests["1"].num_tokens + 1,
+    ]
+    while engine_core.scheduler.get_num_unfinished_requests() == 2:
+        output = engine_core.step_with_batch_queue()[0]
+        # Every step consumes an output.
+        assert output is not None
+        assert len(output[0].outputs) == 1
+        if req_id in engine_core.scheduler.requests:
+            assert (
+                engine_core.scheduler.requests[req_id].num_tokens
+                == expected_num_tokens[req_id]
            )
-        assert engine_core.batch_queue is not None
-
-        # Add two requests in a row. Each request have 12 prompt tokens.
-        req0 = make_request_with_max_tokens("0", 5)
-        engine_core.add_request(*engine_core.preprocess_add_request(req0))
-        req1 = make_request_with_max_tokens("1", 5)
-        engine_core.add_request(*engine_core.preprocess_add_request(req1))
-
-        # Schedule Batch 1: (10, req0)
-        assert engine_core.step_with_batch_queue()[0] is None
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["0"] == 10
-        # num_computed_tokens should have been updated immediately.
-        assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
-
-        # Schedule Batch 2: (2, req0), (8, req1)
-        assert engine_core.step_with_batch_queue()[0] == {}
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["0"] == 2
-        assert scheduler_output.num_scheduled_tokens["1"] == 8
-        # num_computed_tokens should have been updated immediately.
-        assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
-        assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
-
-        assert engine_core.scheduler.get_num_unfinished_requests() == 2
-
-        # Finish Batch 1 and schedule Batch 3: (4, req1).
-        # Note that req0 cannot be scheduled
-        # because it is in the decoding stage now.
-        engine_core.step_with_batch_queue()
-        assert len(engine_core.batch_queue) == 1
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["1"] == 4
-
-        # Finish Batch 2. Get first token of req0.
-        # Schedule Batch 4: (1, req0).
-        output = engine_core.step_with_batch_queue()[0].get(0)
-        assert output is not None
-        assert len(output.outputs) == 1
-        assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["0"] == 1
-
-        # Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
-        output = engine_core.step_with_batch_queue()[0].get(0)
-        assert output is not None
-        assert len(output.outputs) == 1
-        assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
-        scheduler_output = engine_core.batch_queue[-1][1]
-        assert scheduler_output.num_scheduled_tokens["1"] == 1
-
-        # Loop until req0 is finished.
-        req_id = 0
-        expected_num_tokens = [
-            engine_core.scheduler.requests["0"].num_tokens + 1,
-            engine_core.scheduler.requests["1"].num_tokens + 1,
-        ]
-        while engine_core.scheduler.get_num_unfinished_requests() == 2:
-            output = engine_core.step_with_batch_queue()[0]
-            # Every step consumes an output.
-            assert output is not None
-            assert len(output[0].outputs) == 1
-            if req_id in engine_core.scheduler.requests:
-                assert (
-                    engine_core.scheduler.requests[req_id].num_tokens
-                    == expected_num_tokens[req_id]
-                )
-            expected_num_tokens[req_id] += 1
-            req_id = (req_id + 1) % 2
+        expected_num_tokens[req_id] += 1
+        req_id = (req_id + 1) % 2


@multi_gpu_test(num_gpus=2)
-def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_tp():
    """
    Test engine can initialize worker in tp properly
    """

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        """Setup the EngineCore."""
-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            tensor_parallel_size=2,
-            # Reduce startup time.
-            enforce_eager=True,
-        )
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
+    """Setup the EngineCore."""
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        tensor_parallel_size=2,
+        # Reduce startup time.
+        enforce_eager=True,
+    )
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)

-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
-
-        def get_worker_cache_config_field(worker, key: str):
-            return getattr(worker.cache_config, key)
-
-        num_gpu_blocks = engine_core.collective_rpc(
-            get_worker_cache_config_field, args=("num_gpu_blocks",)
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
        )
-        num_cpu_blocks = engine_core.collective_rpc(
-            get_worker_cache_config_field, args=("num_cpu_blocks",)
-        )
-        assert all(x is not None for x in num_gpu_blocks)
-        assert all(x is not None for x in num_cpu_blocks)
+
+    def get_worker_cache_config_field(worker, key: str):
+        return getattr(worker.cache_config, key)
+
+    num_gpu_blocks = engine_core.collective_rpc(
+        get_worker_cache_config_field, args=("num_gpu_blocks",)
+    )
+    num_cpu_blocks = engine_core.collective_rpc(
+        get_worker_cache_config_field, args=("num_cpu_blocks",)
+    )
+    assert all(x is not None for x in num_gpu_blocks)
+    assert all(x is not None for x in num_cpu_blocks)


@create_new_process_for_each_test()
-def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
+def test_engine_core_invalid_request_id_type():
    """Test that engine raises TypeError for non-string request_id."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    engine_args = EngineArgs(model=MODEL_NAME)
+    vllm_config = engine_args.create_engine_config()
+    executor_class = Executor.get_class(vllm_config)

-        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
-        executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        engine_core = EngineCore(
+            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
+        )

-        with set_default_torch_num_threads(1):
-            engine_core = EngineCore(
-                vllm_config=vllm_config, executor_class=executor_class, log_stats=True
-            )
+    # Test with UUID object (common mistake)
+    uuid_request = make_request()
+    uuid_request.request_id = uuid.uuid4()  # UUID object instead of string

-        # Test with UUID object (common mistake)
-        uuid_request = make_request()
-        uuid_request.request_id = uuid.uuid4()  # UUID object instead of string
+    with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
+        engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))

-        with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
-            engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
+    # Test with integer
+    int_request = make_request()
+    int_request.request_id = 12345

-        # Test with integer
-        int_request = make_request()
-        int_request.request_id = 12345
+    with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
+        engine_core.add_request(*engine_core.preprocess_add_request(int_request))

-        with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
-            engine_core.add_request(*engine_core.preprocess_add_request(int_request))
+    # Test with None
+    none_request = make_request()
+    none_request.request_id = None

-        # Test with None
-        none_request = make_request()
-        none_request.request_id = None
+    with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
+        engine_core.add_request(*engine_core.preprocess_add_request(none_request))

-        with pytest.raises(
-            TypeError, match="request_id must be a string, got.*NoneType"
-        ):
-            engine_core.add_request(*engine_core.preprocess_add_request(none_request))
-
-        # Verify engine is still functional after errors
-        valid_request = make_request()
-        engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
+    # Verify engine is still functional after errors
+    valid_request = make_request()
+    engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
+    assert len(engine_core.scheduler.waiting) == 1
+    assert len(engine_core.scheduler.running) == 0
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@ -130,8 +130,6 @@ def test_engine_core_client(
    monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Monkey-patch core engine utility function to test.
        m.setattr(EngineCore, "echo", echo, raising=False)

@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function")
 async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Monkey-patch core engine utility function to test.
        m.setattr(EngineCore, "echo", echo, raising=False)

@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
    monkeypatch: pytest.MonkeyPatch,
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Must set insecure serialization to allow returning custom types.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
    monkeypatch: pytest.MonkeyPatch,
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Must set insecure serialization to allow returning custom types.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
    monkeypatch: pytest.MonkeyPatch,
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Must set insecure serialization to allow returning custom types.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
    indirect=["publisher_config"],
 )
 def test_kv_cache_events(
-    monkeypatch: pytest.MonkeyPatch,
    multiprocessing_mode: bool,
    publisher_config,
 ):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        block_size = 16
-        num_blocks = 2
+    block_size = 16
+    num_blocks = 2

-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            enforce_eager=True,
-            enable_prefix_caching=True,
-            block_size=block_size,
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        block_size=block_size,
+    )
+    engine_args.kv_events_config = publisher_config
+
+    vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+
+    executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
        )
-        engine_args.kv_events_config = publisher_config
+    endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+    subscriber = MockSubscriber(
+        endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
+    )

-        vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+    try:
+        custom_tokens = list(range(num_blocks * block_size))
+        sampling_params = SamplingParams(max_tokens=1)
+        request = make_request(sampling_params, custom_tokens)
+        client.add_request(request)

-        executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
-            client = EngineCoreClient.make_client(
-                multiprocess_mode=multiprocessing_mode,
-                asyncio_mode=False,
-                vllm_config=vllm_config,
-                executor_class=executor_class,
-                log_stats=False,
-            )
-        endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
-        subscriber = MockSubscriber(
-            endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
+        outputs: dict[str, list] = {request.request_id: []}
+        loop_until_done(client, outputs)
+
+        result = subscriber.receive_one(timeout=1000)
+        assert result is not None, "No message received"
+
+        seq, received = result
+
+        assert seq == 0, "Sequence number mismatch"
+        assert len(received.events) == 1, "We should have exactly one BlockStored event"
+        event = received.events[0]
+        assert isinstance(event, BlockStored), "We should have a BlockStored event"
+        assert len(event.block_hashes) == num_blocks, (
+            "We should have a BlockStored event with 2 block_hashes"
        )
-
-        try:
-            custom_tokens = list(range(num_blocks * block_size))
-            sampling_params = SamplingParams(max_tokens=1)
-            request = make_request(sampling_params, custom_tokens)
-            client.add_request(request)
-
-            outputs: dict[str, list] = {request.request_id: []}
-            loop_until_done(client, outputs)
-
-            result = subscriber.receive_one(timeout=1000)
-            assert result is not None, "No message received"
-
-            seq, received = result
-
-            assert seq == 0, "Sequence number mismatch"
-            assert len(received.events) == 1, (
-                "We should have exactly one BlockStored event"
-            )
-            event = received.events[0]
-            assert isinstance(event, BlockStored), "We should have a BlockStored event"
-            assert len(event.block_hashes) == num_blocks, (
-                "We should have a BlockStored event with 2 block_hashes"
-            )
-            assert event.block_size == block_size, (
-                "Block size should be the same as the block size"
-            )
-            assert event.parent_block_hash is None, "Parent block hash should be None"
-            assert event.lora_id is None, "Lora id should be None"
-            assert len(event.token_ids) == num_blocks * block_size, (
-                "Token ids should be the same as the custom tokens"
-            )
-            assert event.token_ids == custom_tokens, (
-                "Token ids should be the same as the custom tokens"
-            )
-        finally:
-            client.shutdown()
-            subscriber.close()
+        assert event.block_size == block_size, (
+            "Block size should be the same as the block size"
+        )
+        assert event.parent_block_hash is None, "Parent block hash should be None"
+        assert event.lora_id is None, "Lora id should be None"
+        assert len(event.token_ids) == num_blocks * block_size, (
+            "Token ids should be the same as the custom tokens"
+        )
+        assert event.token_ids == custom_tokens, (
+            "Token ids should be the same as the custom tokens"
+        )
+    finally:
+        client.shutdown()
+        subscriber.close()


@pytest.mark.asyncio
@ -672,101 +657,96 @@ def test_kv_cache_events(
 )
@multi_gpu_test(num_gpus=4)
 async def test_kv_cache_events_dp(
-    monkeypatch: pytest.MonkeyPatch,
    multiprocessing_mode: bool,
    publisher_config,
 ):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        block_size = 16
-        num_blocks = 2
-        dp_size = 2
-        tp_size = 2
+    block_size = 16
+    num_blocks = 2
+    dp_size = 2
+    tp_size = 2

-        engine_args = EngineArgs(
-            model=MODEL_NAME,
-            enforce_eager=True,
-            enable_prefix_caching=True,
-            data_parallel_size=dp_size,
-            tensor_parallel_size=tp_size,
-            block_size=block_size,
+    engine_args = EngineArgs(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        data_parallel_size=dp_size,
+        tensor_parallel_size=tp_size,
+        block_size=block_size,
+    )
+    engine_args.kv_events_config = publisher_config
+
+    vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+
+    executor_class = Executor.get_class(vllm_config)
+    with set_default_torch_num_threads(1):
+        client = EngineCoreClient.make_client(
+            multiprocess_mode=multiprocessing_mode,
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=False,
        )
-        engine_args.kv_events_config = publisher_config
+    await asyncio.sleep(1)

-        vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
+    # Build endpoints for all DP ranks
+    base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
+    endpoints = []
+    for i in range(dp_size):
+        offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
+        endpoints.append(offset_endpoint)

-        executor_class = Executor.get_class(vllm_config)
-        with set_default_torch_num_threads(1):
-            client = EngineCoreClient.make_client(
-                multiprocess_mode=multiprocessing_mode,
-                asyncio_mode=True,
-                vllm_config=vllm_config,
-                executor_class=executor_class,
-                log_stats=False,
-            )
-        await asyncio.sleep(1)
+    subscriber = MockSubscriber(
+        endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
+    )

-        # Build endpoints for all DP ranks
-        base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
-        endpoints = []
-        for i in range(dp_size):
-            offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
-            endpoints.append(offset_endpoint)
+    try:
+        custom_tokens = list(range(num_blocks * block_size))
+        sampling_params = SamplingParams(max_tokens=1)
+        all_request_ids = []

-        subscriber = MockSubscriber(
-            endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
+        # Create and add 25 requests
+        # NOTE: attempts to force routing to both dp groups but can be flaky
+        for i in range(25):
+            await asyncio.sleep(0.01)
+            request = make_request(sampling_params, custom_tokens)
+            await client.add_request_async(request)
+            all_request_ids.append(request.request_id)
+
+        await asyncio.sleep(0.1)
+
+        # Initialize outputs dict for all requests
+        outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
+
+        print("processing requests...")
+        await asyncio.wait_for(
+            loop_until_fully_done_async(client, outputs), timeout=20.0
        )

-        try:
-            custom_tokens = list(range(num_blocks * block_size))
-            sampling_params = SamplingParams(max_tokens=1)
-            all_request_ids = []
+        # Receive from subscriber until no more messages
+        print("collecting results...")
+        results = []
+        while True:
+            result = subscriber.receive_one(timeout=1)
+            print(result)
+            if result is None:
+                break
+            results.append(result)

-            # Create and add 25 requests
-            # NOTE: attempts to force routing to both dp groups but can be flaky
-            for i in range(25):
-                await asyncio.sleep(0.01)
-                request = make_request(sampling_params, custom_tokens)
-                await client.add_request_async(request)
-                all_request_ids.append(request.request_id)
+        # Collect all events and data_parallel_ranks from all results
+        all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
+        unique_dps = set(all_dp_ranks)
+        assert len(unique_dps) == 2, (
+            f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
+        )

-            await asyncio.sleep(0.1)
-
-            # Initialize outputs dict for all requests
-            outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
-
-            print("processing requests...")
-            await asyncio.wait_for(
-                loop_until_fully_done_async(client, outputs), timeout=20.0
-            )
-
-            # Receive from subscriber until no more messages
-            print("collecting results...")
-            results = []
-            while True:
-                result = subscriber.receive_one(timeout=1)
-                print(result)
-                if result is None:
-                    break
-                results.append(result)
-
-            # Collect all events and data_parallel_ranks from all results
-            all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
-            unique_dps = set(all_dp_ranks)
-            assert len(unique_dps) == 2, (
-                f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
-            )
-
-        finally:
-            client.shutdown()
-            subscriber.close()
+    finally:
+        client.shutdown()
+        subscriber.close()


@pytest.mark.timeout(20)
 def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
-        m.setenv("VLLM_USE_V1", "1")
-
        # Monkey-patch to extract core process pid while it's starting.
        core_proc_pid = [None]
        cepm_ctor = CoreEngineProcManager.__init__
@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
    mock_executor_class.side_effect = create_mock_executor

    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("CUDA_VISIBLE_DEVICES", "")  # No CUDA devices

        from vllm.v1.engine.utils import EngineZmqAddresses
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@ -21,12 +21,10 @@ DTYPE = "half"
 def _vllm_model(
    apc: bool,
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    *,
    skip_tokenizer_init: bool = False,
 ):
    """Set up VllmRunner instance."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
    return vllm_runner(
        MODEL,
        dtype=DTYPE,
@ -45,16 +43,16 @@ def _vllm_model(
    # Prefix caching
    params=[False, True],
 )
-def vllm_model(vllm_runner, request, monkeypatch):
+def vllm_model(vllm_runner, request):
    """VllmRunner test fixture parameterized by APC True/False."""
-    with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
+    with _vllm_model(request.param, vllm_runner) as vllm_model:
        yield vllm_model


@pytest.fixture(scope="function")
-def vllm_model_apc(vllm_runner, monkeypatch):
+def vllm_model_apc(vllm_runner):
    """VllmRunner test fixture with APC."""
-    with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
+    with _vllm_model(True, vllm_runner) as vllm_model:
        yield vllm_model


@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
    # Prefix caching
    params=[False, True],
 )
-def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
+def vllm_model_skip_tokenizer_init(vllm_runner, request):
    """VllmRunner test fixture with APC."""
    with _vllm_model(
        request.param,
        vllm_runner,
-        monkeypatch,
        skip_tokenizer_init=True,
    ) as vllm_model:
        yield vllm_model
@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
            )


-def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
+def test_engine_metrics(vllm_runner, example_prompts):
    max_tokens = 100
    # Use spec decoding to test num_accepted_tokens_per_pos
    speculative_config = {
@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
        "prompt_lookup_min": 3,
        "num_speculative_tokens": 5,
    }
-    monkeypatch.setenv("VLLM_USE_V1", "1")
+
    with vllm_runner(
        MODEL,
        speculative_config=speculative_config,
@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):


@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
-def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
+def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain
    # token ids.
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
    PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
 )
 def test_structured_output(
-    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    unsupported_json_schema: dict[str, Any],
    sample_sql_ebnf: str,
@ -115,8 +114,6 @@ def test_structured_output(
    model_name: str,
    speculative_config: dict[str, Any],
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

@ -620,15 +617,12 @@ Make the response as short as possible.
    ],
 )
 def test_structured_output_with_reasoning_matrices(
-    monkeypatch: pytest.MonkeyPatch,
    backend: str,
    tokenizer_mode: TokenizerMode,
    reasoning_parser: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")

@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
 def test_structured_output_auto_mode(
-    monkeypatch: pytest.MonkeyPatch,
    unsupported_json_schema: dict[str, Any],
    model_name: str,
    tokenizer_mode: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    llm = LLM(
        model=model_name,
        max_model_len=1024,
@ -739,9 +730,7 @@ def test_structured_output_auto_mode(


@pytest.mark.skip_global_cleanup
-def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
+def test_guidance_no_additional_properties():
    llm = LLM(
        model="Qwen/Qwen2.5-1.5B-Instruct",
        max_model_len=1024,
@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):

@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
 def test_structured_output_batched_with_non_structured_outputs_requests(
-    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    backend: str,
 ):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = bool(not current_platform.is_tpu())
--- a/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh
@ -53,7 +53,6 @@ cleanup() {
 launch_baseline() {
  BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
@ -73,7 +72,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
  PJRT_DEVICE=TPU \
@ -93,7 +91,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
--- a/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh
@ -55,7 +55,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
  VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
  PJRT_DEVICE=TPU \
@ -75,7 +74,6 @@ launch_pd() {
  UCX_TLS=tcp \
  VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
  VLLM_LOGGING_LEVEL=DEBUG \
-  VLLM_USE_V1=1 \
  PJRT_DEVICE=TPU \
  VLLM_WORKER_MULTIPROC_METHOD=spawn \
  VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
--- a/tests/v1/metrics/test_ray_metrics.py
+++ b/tests/v1/metrics/test_ray_metrics.py
@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest
 import ray
@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger

-
-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    """
-    The change relies on V1 APIs, so set VLLM_USE_V1=1.
-    """
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
 MODELS = [
    "distilbert/distilgpt2",
 ]
@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
    @ray.remote(num_gpus=1)
    class EngineTestActor:
        async def run(self):
-            # Set environment variable inside the Ray actor since environment
-            # variables from pytest fixtures don't propagate to Ray actors
-            os.environ["VLLM_USE_V1"] = "1"
-
            engine_args = AsyncEngineArgs(
                model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
            )
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
    batch_logprobs_composition: BatchLogprobsComposition,
    temperature: float,
    example_prompts: list[str],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Test V1 Engine logprobs & prompt logprobs

@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
      temperature: "temperature" sampling parameter
      example_prompts: example prompt fixture
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
-        if do_apc and (
-            temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
-        ):
-            # Skip some test-cases to save time.
-            pytest.skip()
-        test_prompts = example_prompts
+    do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
+    if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
+        # Skip some test-cases to save time.
+        pytest.skip()
+    test_prompts = example_prompts

-        max_tokens = 5
-        hf_outputs = hf_model.generate_greedy(
-            test_prompts,
+    max_tokens = 5
+    hf_outputs = hf_model.generate_greedy(
+        test_prompts,
+        max_tokens=max_tokens,
+    )
+    hf_logprobs = hf_model.generate_greedy_logprobs(
+        test_prompts,
+        max_tokens=max_tokens,
+    )
+
+    # Batch has mixed sample params
+    # (different logprobs/prompt logprobs combos)
+    logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
+
+    # Ensure that each test prompt has a logprob config for testing
+    logprob_prompt_logprob_list = _repeat_logprob_config(
+        test_prompts, logprob_prompt_logprob_list
+    )
+    # Generate SamplingParams
+    vllm_sampling_params = [
+        SamplingParams(
            max_tokens=max_tokens,
+            logprobs=num_lp,
+            prompt_logprobs=num_plp,
+            temperature=temperature,
+            seed=1984,
        )
-        hf_logprobs = hf_model.generate_greedy_logprobs(
-            test_prompts,
+        for num_lp, num_plp in logprob_prompt_logprob_list
+    ]
+    for _ in range(2 if do_apc else 1):
+        _run_and_validate(
+            vllm_model=vllm_model,
+            test_prompts=test_prompts,
+            vllm_sampling_params=vllm_sampling_params,
+            hf_logprobs=hf_logprobs,
+            hf_outputs=hf_outputs,
+            logprob_prompt_logprob_list=logprob_prompt_logprob_list,
+            temperature=temperature,
            max_tokens=max_tokens,
+            do_apc=do_apc,
        )

-        # Batch has mixed sample params
-        # (different logprobs/prompt logprobs combos)
-        logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)

-        # Ensure that each test prompt has a logprob config for testing
-        logprob_prompt_logprob_list = _repeat_logprob_config(
-            test_prompts, logprob_prompt_logprob_list
-        )
-        # Generate SamplingParams
-        vllm_sampling_params = [
-            SamplingParams(
-                max_tokens=max_tokens,
-                logprobs=num_lp,
-                prompt_logprobs=num_plp,
-                temperature=temperature,
-                seed=1984,
-            )
-            for num_lp, num_plp in logprob_prompt_logprob_list
-        ]
-        for _ in range(2 if do_apc else 1):
-            _run_and_validate(
-                vllm_model=vllm_model,
-                test_prompts=test_prompts,
-                vllm_sampling_params=vllm_sampling_params,
-                hf_logprobs=hf_logprobs,
-                hf_outputs=hf_outputs,
-                logprob_prompt_logprob_list=logprob_prompt_logprob_list,
-                temperature=temperature,
-                max_tokens=max_tokens,
-                do_apc=do_apc,
-            )
-
-
-def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
+def test_max_logprobs():
    """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
    Should also fail for `prompt_logprobs > max_logprobs`
    APC should not matter as this test checks basic request validation.
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=1,
+        enable_prefix_caching=False,
+        # 2 other LLMs stay alive during the whole session
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )
+    vllm_sampling_params = SamplingParams(logprobs=1)
+    # should pass
+    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)

-        runner = VllmRunner(
-            "facebook/opt-125m",
-            max_logprobs=1,
-            enable_prefix_caching=False,
-            # 2 other llms alive during whole session
-            gpu_memory_utilization=0.15,
-            max_model_len=256,
-        )
-        vllm_sampling_params = SamplingParams(logprobs=1)
-        # should pass
-        runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
-
-        bad_sampling_params = SamplingParams(logprobs=2)
-        with pytest.raises(ValueError):
-            runner.generate(["Hello world"], sampling_params=bad_sampling_params)
+    bad_sampling_params = SamplingParams(logprobs=2)
+    with pytest.raises(ValueError):
+        runner.generate(["Hello world"], sampling_params=bad_sampling_params)


-def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_none_logprobs(vllm_model, example_prompts):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`

    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        max_tokens = 5
+    max_tokens = 5

-        sampling_params_logprobs_none = SamplingParams(
-            max_tokens=max_tokens,
-            logprobs=None,
-            prompt_logprobs=None,
-            temperature=0.0,
-        )
-        results_logprobs_none = vllm_model.llm.generate(
-            example_prompts,
-            sampling_params=sampling_params_logprobs_none,
-        )
+    sampling_params_logprobs_none = SamplingParams(
+        max_tokens=max_tokens,
+        logprobs=None,
+        prompt_logprobs=None,
+        temperature=0.0,
+    )
+    results_logprobs_none = vllm_model.llm.generate(
+        example_prompts,
+        sampling_params=sampling_params_logprobs_none,
+    )

-        for i in range(len(results_logprobs_none)):
-            # Check sample logprobs are None
-            assert results_logprobs_none[i].outputs[0].logprobs is None
-            assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
-            # Check prompt logprobs are None
-            assert results_logprobs_none[i].prompt_logprobs is None
+    for i in range(len(results_logprobs_none)):
+        # Check sample logprobs are None
+        assert results_logprobs_none[i].outputs[0].logprobs is None
+        assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
+        # Check prompt logprobs are None
+        assert results_logprobs_none[i].prompt_logprobs is None


-def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_zero_logprobs(vllm_model, example_prompts):
    """Engine should return sampled token and prompt token logprobs

    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        max_tokens = 5
+    max_tokens = 5

-        sampling_params_logprobs_zero = SamplingParams(
-            max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
-        )
-        results_logprobs_zero = vllm_model.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_zero
-        )
+    sampling_params_logprobs_zero = SamplingParams(
+        max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
+    )
+    results_logprobs_zero = vllm_model.llm.generate(
+        example_prompts, sampling_params=sampling_params_logprobs_zero
+    )

-        for i in range(len(results_logprobs_zero)):
-            # Check that there is one sample logprob dict for each
-            # sample token
-            logprobs = results_logprobs_zero[i].outputs[0].logprobs
-            prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
-            sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
-            prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
-            assert logprobs is not None
-            assert len(sampled_token_ids) == len(logprobs)
-            assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
-            # Check that there is one prompt logprob dict for each
-            # prompt token
-            assert prompt_logprobs is not None
-            assert len(prompt_token_ids) == len(prompt_logprobs)
+    for i in range(len(results_logprobs_zero)):
+        # Check that there is one sample logprob dict for each
+        # sample token
+        logprobs = results_logprobs_zero[i].outputs[0].logprobs
+        prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
+        sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
+        prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
+        assert logprobs is not None
+        assert len(sampled_token_ids) == len(logprobs)
+        assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
+        # Check that there is one prompt logprob dict for each
+        # prompt token
+        assert prompt_logprobs is not None
+        assert len(prompt_token_ids) == len(prompt_logprobs)


-def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_all_logprobs(example_prompts):
    """Engine should return all vocabulary logprobs and prompt logprobs

    Args:
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        runner = VllmRunner(
-            "facebook/opt-125m",
-            max_logprobs=-1,
-            enable_prefix_caching=False,
-            # 2 other llms alive during whole session
-            gpu_memory_utilization=0.15,
-            max_model_len=256,
-        )
+    runner = VllmRunner(
+        "facebook/opt-125m",
+        max_logprobs=-1,
+        enable_prefix_caching=False,
+        # 2 other LLMs stay alive during the whole session
+        gpu_memory_utilization=0.15,
+        max_model_len=256,
+    )

-        sampling_params_logprobs_all = SamplingParams(
-            max_tokens=5, logprobs=-1, prompt_logprobs=-1
-        )
-        results_logprobs_all = runner.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_all
-        )
-        vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
+    sampling_params_logprobs_all = SamplingParams(
+        max_tokens=5, logprobs=-1, prompt_logprobs=-1
+    )
+    results_logprobs_all = runner.llm.generate(
+        example_prompts, sampling_params=sampling_params_logprobs_all
+    )
+    vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()

-        for i in range(len(results_logprobs_all)):
-            logprobs = results_logprobs_all[i].outputs[0].logprobs
-            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
-            assert logprobs is not None
-            for logprob in logprobs:
-                assert len(logprob) == vocab_size
-            assert prompt_logprobs is not None
-            assert prompt_logprobs[0] is None
-            for prompt_logprob in prompt_logprobs[1:]:
-                assert len(prompt_logprob) == vocab_size
+    for i in range(len(results_logprobs_all)):
+        logprobs = results_logprobs_all[i].outputs[0].logprobs
+        prompt_logprobs = results_logprobs_all[i].prompt_logprobs
+        assert logprobs is not None
+        for logprob in logprobs:
+            assert len(logprob) == vocab_size
+        assert prompt_logprobs is not None
+        assert prompt_logprobs[0] is None
+        for prompt_logprob in prompt_logprobs[1:]:
+            assert len(prompt_logprob) == vocab_size


@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
-def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
+def test_logprobs_mode(logprobs_mode: LogprobsMode):
    """Test with LLM engine with different logprobs_mode.
    For logprobs, we should have non-positive values.
    For logits, we should expect at least one positive values.
    """
    from vllm import LLM

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    llm = LLM(
+        "facebook/opt-125m",
+        max_logprobs=5,
+        enable_prefix_caching=False,
+        # 2 other LLMs stay alive during the whole session
+        gpu_memory_utilization=0.05,
+        max_model_len=16,
+        logprobs_mode=logprobs_mode,
+    )
+    vllm_sampling_params = SamplingParams(logprobs=1)
+    results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)

-        llm = LLM(
-            "facebook/opt-125m",
-            max_logprobs=5,
-            enable_prefix_caching=False,
-            # 2 other llms alive during whole session
-            gpu_memory_utilization=0.05,
-            max_model_len=16,
-            logprobs_mode=logprobs_mode,
-        )
-        vllm_sampling_params = SamplingParams(logprobs=1)
-        results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
-
-        total_token_with_logprobs = 0
-        positive_values = 0
-        for output in results[0].outputs:
-            for logprobs in output.logprobs:
-                for token_id in logprobs:
-                    logprob = logprobs[token_id]
-                    if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
-                        assert logprob.logprob <= 0
-                    if logprob.logprob > 0:
-                        positive_values = positive_values + 1
-                    total_token_with_logprobs = total_token_with_logprobs + 1
-        assert total_token_with_logprobs >= len(results[0].outputs)
-        if logprobs_mode in ("raw_logits", "processed_logits"):
-            assert positive_values > 0
-        del llm
+    total_token_with_logprobs = 0
+    positive_values = 0
+    for output in results[0].outputs:
+        for logprobs in output.logprobs:
+            for token_id in logprobs:
+                logprob = logprobs[token_id]
+                if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
+                    assert logprob.logprob <= 0
+                if logprob.logprob > 0:
+                    positive_values = positive_values + 1
+                total_token_with_logprobs = total_token_with_logprobs + 1
+    assert total_token_with_logprobs >= len(results[0].outputs)
+    if logprobs_mode in ("raw_logits", "processed_logits"):
+        assert positive_values > 0
+    del llm
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest

 from vllm import LLM, SamplingParams

-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
 MODEL = "meta-llama/Llama-3.2-1B"
 PROMPT = "Hello my name is Robert and I"

@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))


-def test_priority(llm):
-    """Check that we reject requests with priority."""
-
-    # Reject all allowed token ids
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, priority=[1])
-
-
 def test_seed(llm):
    """Check that seed impacts randomness."""

--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
@ -38,7 +38,6 @@ def test_eagle_max_len(
    monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
 ):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    model: str,
    max_tokens: int,
    tensor_parallel_size: int,
@ -55,23 +54,20 @@ def test_basic(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+        model,
+        # Note: max_num_batched_tokens == 1024 is needed here to
+        # actually test chunked prompt
+        max_num_batched_tokens=1024,
+        max_model_len=8192,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]

-        with vllm_runner(
-            model,
-            # Note: max_num_batched_tokens == 1024 is needed here to
-            # actually test chunked prompt
-            max_num_batched_tokens=1024,
-            max_model_len=8192,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        output = vllm_outputs[0][1]
-
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output


@pytest.mark.skip(reason="Temporarily disabled due to timeout")
@ -82,7 +78,6 @@ def test_basic(
@pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    max_tokens: int,
    max_num_seqs: int,
 ) -> None:
@ -99,18 +94,15 @@ def test_phi3(
    # test head dim = 96
    model = "microsoft/Phi-3-mini-128k-instruct"

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-        # vllm_outputs is a list of tuples whose first element is the token id
-        # and the second element is the output (including the prompt).
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
+    with vllm_runner(
+        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    # vllm_outputs is a list of tuples whose first element is the token id
+    # and the second element is the output (including the prompt).
+    for output, answer in zip(vllm_outputs, answers):
+        generated_text = output[1]
+        assert answer in generated_text


 TP_SIZE_8 = 8
@ -123,7 +115,6 @@ TP_SIZE_8 = 8
 )
 def test_gemma3_27b_with_text_input_and_tp(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "google/gemma-3-27b-it"
    max_tokens = 16
@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
        " but in rising every time we fall.",
    ]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            max_num_batched_tokens=256,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-        # vllm_outputs is a list of tuples whose first element is the token id
-        # and the second element is the output (including the prompt).
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=256,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    # vllm_outputs is a list of tuples whose first element is the token id
+    # and the second element is the output (including the prompt).
+    for output, answer in zip(vllm_outputs, answers):
+        generated_text = output[1]
+        assert answer in generated_text


@pytest.mark.skipif(
@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
 )
 def test_w8a8_quantization(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    max_tokens = 5
@ -176,18 +163,15 @@ def test_w8a8_quantization(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=64,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]

-        with vllm_runner(
-            model,
-            max_num_batched_tokens=64,
-            max_model_len=4096,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        output = vllm_outputs[0][1]
-
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@pytest.mark.parametrize("params", TEST_PARAMS)
 def test_perf(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
 ) -> None:
    tokenizer = get_tokenizer(
@ -107,48 +106,45 @@ def test_perf(
        )
    )

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    sampling_params = SamplingParams(
+        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+    )

-        sampling_params = SamplingParams(
-            max_tokens=params.decode_len, temperature=1.0, min_p=0.0
-        )
+    with vllm_runner(
+        params.model,
+        max_num_batched_tokens=MAX_MODEL_LEN,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=MAX_NUM_SEQS,
+        gpu_memory_utilization=GPU_UTIL,
+        enforce_eager=False,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        print("  -- Warmup / Compile")
+        for i in range(NUM_WARMUPS):
+            _ = vllm_model.generate(prompts, sampling_params)

-        with vllm_runner(
-            params.model,
-            max_num_batched_tokens=MAX_MODEL_LEN,
-            max_model_len=MAX_MODEL_LEN,
-            max_num_seqs=MAX_NUM_SEQS,
-            gpu_memory_utilization=GPU_UTIL,
-            enforce_eager=False,
-            tensor_parallel_size=1,
-        ) as vllm_model:
-            print("  -- Warmup / Compile")
-            for i in range(NUM_WARMUPS):
-                _ = vllm_model.generate(prompts, sampling_params)
+        print("  -- Benchmarking... ")
+        times = []
+        for i in range(NUM_RUNS):
+            start_time = time.time()
+            _ = vllm_model.generate(prompts, sampling_params)
+            times.append(time.time() - start_time)

-            print("  -- Benchmarking... ")
-            times = []
-            for i in range(NUM_RUNS):
-                start_time = time.time()
-                _ = vllm_model.generate(prompts, sampling_params)
-                times.append(time.time() - start_time)
+        avg_time = sum(times) / len(times)

-            avg_time = sum(times) / len(times)
-
-            print("  -- avg_time = {}".format(avg_time))
-            print(
-                "  -- expected_avg_time = {} with err_tol = {}".format(
-                    params.expected_avg_time, params.err_tol
-                )
+        print("  -- avg_time = {}".format(avg_time))
+        print(
+            "  -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol
+            )
+        )
+        diff = avg_time - params.expected_avg_time
+        ok = diff < params.err_tol
+        if diff < -params.err_tol:
+            print(
+                "  !! WARNING !! Performance has improved by {}, "
+                "it may be necessary to fine-tune the "
+                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
            )
-            diff = avg_time - params.expected_avg_time
-            ok = diff < params.err_tol
-            if diff < -params.err_tol:
-                print(
-                    "  !! WARNING !! Performance has improved by {}, "
-                    "it may be necessary to fine-tune the "
-                    "expected_avg_time = {}".format(-diff, params.expected_avg_time)
-                )

-            assert ok, " !! ERROR !! Regression detected"
+        assert ok, " !! ERROR !! Regression detected"
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@ -82,7 +82,7 @@ def test_traces(
 ):
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
+
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
        logger.info("Warming up model for the compilation...")
        # Only generate graph for the generic shape
        with _set_global_compilation_settings(self.vllm_config):
-            self._dummy_run(max(16, self.max_num_reqs))
+            self._dummy_run(
+                min(
+                    max(16, self.max_num_reqs),
+                    self.scheduler_config.max_num_batched_tokens,
+                )
+            )
+
        logger.info("Warming up done.")

    def _init_device_properties(self) -> None: