From a73e183e36a818ea95f442ae1751bc66cf4f135d Mon Sep 17 00:00:00 2001
From: Sibi <85477603+t-sibiraj@users.noreply.github.com>
Date: Mon, 17 Mar 2025 11:35:57 +0800
Subject: [PATCH] [Misc] Replace os environ to monkeypatch in test suite
 (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham
Co-authored-by: Cyrus Leung
Co-authored-by: Aaron Pham
---
 .buildkite/test-pipeline.yaml | 2 +-
 .../test_basic_correctness.py | 105 +++---
 .../basic_correctness/test_chunked_prefill.py | 168 +++++-----
 tests/basic_correctness/test_cumem.py | 62 ++--
 tests/compile/test_basic_correctness.py | 207 ++++++------
 tests/compile/test_full_graph.py | 115 ++++++-
 tests/compile/utils.py | 93 ------
 tests/conftest.py | 2 +-
 tests/distributed/test_comm_ops.py | 85 +++--
 tests/distributed/test_custom_all_reduce.py | 173 +++++-----
 tests/distributed/test_pipeline_partition.py | 60 ++--
 tests/distributed/test_pp_cudagraph.py | 38 ++-
 tests/entrypoints/llm/test_accuracy.py | 4 +-
 .../offline_mode/test_offline_mode.py | 49 +--
 .../openai/correctness/test_lmeval.py | 5 +-
 tests/kernels/test_attention_selector.py | 129 +++++---
 tests/kernels/test_awq.py | 60 ++--
 tests/kernels/test_rocm_attention_selector.py | 18 +-
 tests/kernels/utils.py | 64 ++--
 .../{disagg_test.py => test_disagg.py} | 0
 .../{module_test.py => test_module.py} | 0
 .../models/decoder_only/language/test_fp8.py | 120 +++---
 .../models/embedding/language/test_gritlm.py | 96 +++---
 tests/models/test_oot_registration.py | 130 ++++----
 tests/mq_llm_engine/test_error_handling.py | 31 +-
 .../multi_step/test_correctness_async_llm.py | 202 ++++------
 tests/multi_step/test_correctness_llm.py | 299 ++++++++---------
 tests/neuron/1_core/test_block_table.py | 80 ++---
 tests/neuron/1_core/test_prefix_prefill.py | 306 +++++++++---------
 tests/plugins_tests/test_platform_plugins.py | 13 +-
 tests/plugins_tests/test_scheduler_plugins.py | 62 ++--
 tests/prefix_caching/test_prefix_caching.py | 111 ++++---
 tests/test_regression.py | 16 +-
 tests/test_utils.py | 63 ++--
 tests/tpu/test_custom_dispatcher.py | 25 +-
 tests/tracing/test_tracing.py | 277 ++++++++--------
 tests/utils.py | 11 +-
 tests/v1/e2e/test_ngram_spec_decode.py | 11 +-
 tests/v1/engine/test_async_llm.py | 11 +-
 tests/v1/engine/test_engine_core.py | 10 +-
 tests/v1/engine/test_engine_core_client.py | 5 +-
 tests/v1/sample/test_logprobs.py | 224 +++++------
 tests/v1/tpu/test_basic.py | 16 +-
 43 files changed, 1900 insertions(+), 1658 deletions(-)
 delete mode 100644 tests/compile/utils.py
 rename tests/kv_transfer/{disagg_test.py => test_disagg.py} (100%)
 rename tests/kv_transfer/{module_test.py => test_module.py} (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a6616d7b41480..f85572e7c234c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -522,7 +522,7 @@ steps:
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

- label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 0cb3b739b7245..1458f0893a93c 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ 
b/tests/basic_correctness/test_basic_correctness.py @@ -47,6 +47,7 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) def test_models( + monkeypatch: pytest.MonkeyPatch, hf_runner, model: str, backend: str, @@ -63,31 +64,33 @@ def test_models( pytest.skip( f"{backend} does not support gemma2 with full context length.") - os.environ["VLLM_ATTENTION_BACKEND"] = backend + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", backend) - # 5042 tokens for gemma2 - # gemma2 has alternating sliding window size of 4096 - # we need a prompt with more than 4096 tokens to test the sliding window - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" - example_prompts = [prompt] + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 tokens to test the sliding window + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with VllmRunner(model, - max_model_len=8192, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -104,6 +107,7 @@ def test_models( ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) def test_models_distributed( + monkeypatch: pytest.MonkeyPatch, hf_runner, vllm_runner, example_prompts, @@ -116,34 +120,41 @@ def test_models_distributed( if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + with monkeypatch.context() as monkeypatch_context: + if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + # test Ray Compiled Graph + monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if attention_backend: - os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend + if attention_backend: + monkeypatch_context.setenv( + "VLLM_ATTENTION_BACKEND", + attention_backend, + ) - dtype = "half" - max_tokens = 5 + dtype = "half" + max_tokens = 5 - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method + # (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index be007de321c8a..06c9e25ed8dd8 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,16 +7,22 @@ prefill requests are chunked. Run `pytest tests/models/test_chunked_prefill.py`. """ -import os + +from __future__ import annotations + +from typing import TYPE_CHECKING import pytest -from tests.kernels.utils import override_backend_env_variable from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal from ..utils import multi_gpu_test +if TYPE_CHECKING: + from .conftest import HfRunner, VllmRunner + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct", @@ -24,12 +30,14 @@ MODELS = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the file. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield @pytest.mark.parametrize("model", MODELS) @@ -42,8 +50,8 @@ def use_v0_only(monkeypatch): @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -52,37 +60,39 @@ def test_models( enforce_eager: bool, tensor_parallel_size: int, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Checks exact match decode between huggingface model and vllm runner with chunked prefill. 
""" - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -90,57 +100,61 @@ def test_models( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models_distributed( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, distributed_executor_backend: str, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) + if (model == "meta-llama/Llama-3.2-1B-Instruct" + and distributed_executor_backend == "ray"): + # test Ray Compiled Graph + m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + dtype = "half" + max_tokens = 5 + chunked_prefill_token_size = 16 - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 + # Add a chunked prefill config. + max_num_seqs = min(chunked_prefill_token_size, 256) + assert chunked_prefill_token_size != -1 + enable_chunked_prefill = True + max_num_batched_tokens = chunked_prefill_token_size - # Add a chunked prefill config. - max_num_seqs = min(chunked_prefill_token_size, 256) - assert chunked_prefill_token_size != -1 - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with + # fork method (the default method). - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. 
- # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + example_prompts, + max_tokens, + ) - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize( @@ -158,7 +172,7 @@ def test_models_distributed( # the async postprocessor @pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models_with_fp8_kv_cache( - vllm_runner, + vllm_runner: VllmRunner, example_prompts, kv_cache_dtype: str, model: str, @@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache( @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("dtype", ["half"]) def test_with_prefix_caching( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, @@ -254,8 +268,10 @@ def test_with_prefix_caching( ) as vllm_model: outputs[enable] = [] for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy([prompt], - max_tokens) + outputs[enable] += vllm_model.generate_greedy( + [prompt], + max_tokens, + ) check_outputs_equal( outputs_0_lst=outputs[False], @@ -274,8 +290,8 @@ def test_with_prefix_caching( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_models_cpu( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -283,7 +299,7 @@ def test_models_cpu( chunked_prefill_token_size: int, enforce_eager: bool, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: test_models( hf_runner, @@ -307,7 +323,7 @@ def test_models_cpu( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_with_prefix_caching_cpu( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index ba81f2bb79d11..f5ee469fb00a9 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -123,40 +123,38 @@ def test_cumem_with_cudagraph(): # sleep mode with pytorch checkpoint ("facebook/opt-125m", False), ]) -def test_end_to_end(model: str, use_v1: bool): - import os - os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" - free, total = torch.cuda.mem_get_info() - used_bytes_baseline = total - free # in case other process is running - llm = 
LLM(model, enable_sleep_mode=True) - prompt = "How are you?" - sampling_params = SamplingParams(temperature=0, max_tokens=10) - output = llm.generate(prompt, sampling_params) +def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) - # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, - # which is difficult to measure in the test. therefore, we only - # test sleep level 1 here. - llm.sleep(level=1) + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) - free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline - # now the memory usage is mostly cudagraph memory pool, - # and it should be less than the model weights (1B model, 2GiB weights) + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) - # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) - # is captured but cannot be releasesd from PyTorch due to a known bug, - # therefore high memory usage after `llm.sleep` is called is expected. - # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode - # in V1. - if use_v1: - assert used_bytes < 7 * GiB_bytes - else: - assert used_bytes < 2 * GiB_bytes + # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) + # is captured but cannot be releasesd from PyTorch due to a known bug, + # therefore high memory usage after `llm.sleep` is called is expected. + # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode + # in V1. 
+ if use_v1: + assert used_bytes < 7 * GiB_bytes + else: + assert used_bytes < 2 * GiB_bytes - llm.wake_up() - output2 = llm.generate(prompt, sampling_params) + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) - # cmp output - assert output[0].outputs[0].text == output2[0].outputs[0].text - - del os.environ["VLLM_USE_V1"] + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 48323b21a8c42..b639fd719ca0a 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import dataclasses -from typing import Optional import pytest @@ -22,75 +22,76 @@ class TestSetting: fullgraph: bool -# representative settings for testing -test_settings = [ - # basic llama model - TestSetting( - model="meta-llama/Llama-3.2-1B-Instruct", - model_args=[], - pp_size=2, - tp_size=2, - attn_backend="FLASHINFER", - method="generate", - fullgraph=True, - ), - # llama model with quantization - TestSetting( - model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - model_args=["--quantization", "gptq"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate", - fullgraph=True, - ), - # MoE model - TestSetting( - model="ibm/PowerMoE-3b", - model_args=[], - pp_size=1, - tp_size=2, - attn_backend="FLASH_ATTN", - method="generate", - fullgraph=True, - ), - # embedding model - TestSetting( - model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="encode", - fullgraph=True, - ), - # encoder-based embedding model (BERT) - TestSetting( - model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="XFORMERS", - method="encode", - fullgraph=True, - ), - # vision language model - TestSetting( - model="microsoft/Phi-3.5-vision-instruct", - model_args=["--trust-remote-code", "--max-model-len", "2048"], - pp_size=2, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate_with_image", - fullgraph=False, - ), -] - - # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize("test_setting", test_settings) -def test_compile_correctness(test_setting: TestSetting): +@pytest.mark.parametrize( + "test_setting", + [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B-Instruct", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="encode", + fullgraph=True, + ), + # encoder-based embedding model (BERT) + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="XFORMERS", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + 
model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), + ]) +def test_compile_correctness( + monkeypatch: pytest.MonkeyPatch, + test_setting: TestSetting, +): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. @@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting): fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") - import os - os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ - ["-tp", str(tp_size)] - all_args: list[list[str]] = [] - all_envs: list[Optional[dict[str, str]]] = [] + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + final_args = [ + "--enforce-eager", *model_args, "-pp", + str(pp_size), "-tp", + str(tp_size) + ] - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.PIECEWISE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) + all_args: list[list[str]] = [] + all_envs: list[dict[str, str] | None] = [] - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close") - all_envs.clear() - all_args.clear() + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. 
+ compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + all_args.clear() - compare_all_settings(model, all_args * 3, all_envs, method=method) + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) + if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: + # "DYNAMO_ONCE" will always use fullgraph + all_envs[-1][ + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 6e83fa36881e4..cf463f3e75254 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,22 +1,115 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest +from __future__ import annotations +from typing import Any + +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm import LLM, SamplingParams from vllm.config import CompilationLevel +from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test -from .utils import TEST_MODELS, check_full_graph_support -@pytest.mark.parametrize("model_info", TEST_MODELS) +@pytest.fixture(params=None, name="model_info") +def models_list_fixture(request): + TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ + ("facebook/opt-125m", {}), + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { + "quantization": "compressed-tensors" + }), + ("meta-llama/Llama-3.2-1B-Instruct", {}), + ] + + if is_quant_method_supported("aqlm"): + TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { + "quantization": "aqlm" + })) + + # TODO: figure out why this fails. 
+ if False and is_quant_method_supported("gguf"): # noqa: SIM223 + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { + "quantization": "gguf" + })) + + if is_quant_method_supported("gptq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { + "quantization": "gptq" + })) + + if is_quant_method_supported("gptq_marlin"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { + "quantization": "gptq_marlin" + })) + + if is_quant_method_supported("gptq_marlin_24"): + TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { + "quantization": "gptq_marlin_24" + })) + + if is_quant_method_supported("marlin"): + TEST_MODELS.append( + ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { + "quantization": "marlin" + })) + + if not current_platform.is_rocm() and is_quant_method_supported("awq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { + "quantization": "AWQ" + })) + + return TEST_MODELS + + @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], +) +@pytest.mark.parametrize("model_info", "", indirect=True) @fork_new_process_for_each_test -def test_full_graph(model_info, optimization_level): - model = model_info[0] - model_kwargs = model_info[1] - check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1) +def test_full_graph( + monkeypatch: pytest.MonkeyPatch, + model_info: tuple[str, dict[str, Any]], + optimization_level: int, +): + model, model_kwargs = model_info + + with monkeypatch.context() as m: + # make sure these models can be captured in full graph mode + m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + print(f"MODEL={model}") + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=optimization_level, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/compile/utils.py b/tests/compile/utils.py deleted file mode 100644 index fb8270c26b1b0..0000000000000 --- a/tests/compile/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import os - -import torch - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -TEST_MODELS = [ - ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { - "quantization": "compressed-tensors" - }), - ("meta-llama/Llama-3.2-1B-Instruct", {}), -] - -if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) - -# TODO: figure out why this fails. 
-if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) - -if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) - -if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) - -if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) - -if is_quant_method_supported("marlin"): - TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - -if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) - - -def check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1): - # make sure these models can be captured in full graph mode - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - - print(f"MODEL={model}") - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=model, - enforce_eager=True, - tensor_parallel_size=tp_size, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs) - - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/conftest.py b/tests/conftest.py index 41c0e62ce14f3..30e5ca2eb137a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items): skip_optional = pytest.mark.skip(reason="need --optional option to run") for item in items: if "optional" in item.keywords: - item.add_marker(skip_optional) + item.add_marker(skip_optional) \ No newline at end of file diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 7b0346b8ab50f..ac6d6aae30063 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -3,7 +3,10 @@ Run `pytest tests/distributed/test_comm_ops.py`. 
""" -import os + +from __future__ import annotations + +from typing import Any, Callable import pytest import ray @@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) -def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_reduce_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_gather_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def broadcast_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + 
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker ]) -def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_parallel(tp_size, 1, test_target) +def test_multi_process_tensor_parallel( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, tp_size, 1, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target): @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) -def test_multi_process_pipeline_parallel(pp_size, test_target): - multi_process_parallel(1, pp_size, test_target) +def test_multi_process_pipeline_parallel( + monkeypatch: pytest.MonkeyPatch, + pp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, 1, pp_size, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target): broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel_pipeline_parallel( - tp_size, pp_size, test_target): - multi_process_parallel(tp_size, pp_size, test_target) + tp_size: int, + pp_size: int, + test_target: Callable[..., Any], + monkeypatch: pytest.MonkeyPatch, +): + multi_process_parallel(monkeypatch, tp_size, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 4928690bebb07..bfa7d06c4d075 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os import random import pytest @@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes): @ray.remote(num_gpus=1, max_calls=1) -def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) - ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group +def graph_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + ensure_model_parallel_initialized(tp_size, pp_size) + group = get_tensor_model_parallel_group().device_group - # A small all_reduce for warmup. - # this is needed because device communicators might be created lazily - # (e.g. NCCL). This will ensure that the communicator is initialized - # before any communication happens, so that this group can be used for - # graph capture immediately. - data = torch.zeros(1) - data = data.to(device=device) - torch.distributed.all_reduce(data, group=group) - torch.cuda.synchronize() - del data + # A small all_reduce for warmup. 
+ # this is needed because device communicators might be created lazily + # (e.g. NCCL). This will ensure that the communicator is initialized + # before any communication happens, so that this group can be used for + # graph capture immediately. + data = torch.zeros(1) + data = data.to(device=device) + torch.distributed.all_reduce(data, group=group) + torch.cuda.synchronize() + del data - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 - for sz in test_sizes: - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture(device=device) as graph_capture_context: - # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - torch.cuda.synchronize() - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, - stream=graph_capture_context.stream): - for i in range(num_communication): - out1 = tensor_model_parallel_all_reduce(inp1) - # the input buffer is immediately modified to test - # synchronization - dist.all_reduce(inp1, group=group) - out2 = tensor_model_parallel_all_reduce(inp2) - dist.all_reduce(inp2, group=group) - graph.replay() - torch.testing.assert_close(out1, inp1) - torch.testing.assert_close(out2, inp2) + for sz in test_sizes: + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + with graph_capture(device=device) as graph_capture_context: + # use integers so result matches NCCL exactly + inp1 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + inp2 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + torch.cuda.synchronize() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, + stream=graph_capture_context.stream): + for i in range(num_communication): + out1 = tensor_model_parallel_all_reduce(inp1) + # the input buffer is immediately modified to test + # synchronization + dist.all_reduce(inp1, group=group) + out2 = tensor_model_parallel_all_reduce(inp2) + dist.all_reduce(inp2, group=group) + graph.replay() + torch.testing.assert_close(out1, inp1) + torch.testing.assert_close(out2, inp2) @ray.remote(num_gpus=1, max_calls=1) -def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) +def eager_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 - sz = 1024 - fa 
= get_tp_group().ca_comm - inp = torch.ones(sz, dtype=torch.float32, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 + sz = 1024 + fa = get_tp_group().ca_comm + inp = torch.ones(sz, dtype=torch.float32, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) - inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) -def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): +def test_custom_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pipeline_parallel_size, + test_target, +): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, + test_target) diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 18c5be29c5ce1..7bf93f270148b 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -7,33 +7,35 @@ import pytest from vllm.distributed.utils import get_pp_indices -def test_custom_layer_partition(): +def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch): - def _verify(partition_str, num_layers, pp_size, goldens): - bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None) - os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str - for pp_rank, golden in enumerate(goldens): - assert get_pp_indices(num_layers, pp_rank, pp_size) == golden - if bak is not None: - os.environ["VLLM_PP_LAYER_PARTITION"] = bak + with monkeypatch.context() as m: - # Even partition - _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Balanced partition - _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) - # Put reminder somewhere - _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) - # Invalid partition strings - with pytest.raises(ValueError): - _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - with pytest.raises(ValueError): - _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of partitions - with pytest.raises(ValueError): - _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of layers - with pytest.raises(ValueError): - _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + def _verify(partition_str, num_layers, pp_size, goldens): + bak = os.environ.get("VLLM_PP_LAYER_PARTITION", 
None) + m.setenv("VLLM_PP_LAYER_PARTITION", partition_str) + for pp_rank, golden in enumerate(goldens): + assert get_pp_indices(num_layers, pp_rank, pp_size) == golden + if bak is not None: + m.setenv("VLLM_PP_LAYER_PARTITION", bak) + + # Even partition + _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Balanced partition + _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) + # Put reminder somewhere + _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) + # Invalid partition strings + with pytest.raises(ValueError): + _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + with pytest.raises(ValueError): + _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of partitions + with pytest.raises(ValueError): + _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of layers + with pytest.raises(ValueError): + _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) @pytest.mark.parametrize( @@ -55,6 +57,10 @@ def test_custom_layer_partition(): (5, 3, 1, (2, 4)), (5, 3, 2, (4, 5)), ]) -def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int, - pp_rank: int, indices: tuple[int, int]): +def test_uneven_auto_partition( + num_hidden_layers: int, + pp_size: int, + pp_rank: int, + indices: tuple[int, int], +): assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size) diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 3bc85b05e7d15..19414971f2b46 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -import os +from typing import TYPE_CHECKING import pytest from ..utils import compare_two_settings, fork_new_process_for_each_test +if TYPE_CHECKING: + from typing_extensions import LiteralString + @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ (2, "JackFram/llama-160m"), @@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test "FLASHINFER", ]) @fork_new_process_for_each_test -def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): - cudagraph_args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--pipeline-parallel-size", - str(PP_SIZE), - "--distributed-executor-backend", - "mp", - ] - os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND +def test_pp_cudagraph( + monkeypatch: pytest.MonkeyPatch, + PP_SIZE: int, + MODEL_NAME: str, + ATTN_BACKEND: LiteralString, +): + with monkeypatch.context() as m: + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "mp", + ] + m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND) - eager_args = cudagraph_args + ["--enforce-eager"] + eager_args = cudagraph_args + ["--enforce-eager"] - compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 3ebc5a44d80c6..77fbb5827da9e 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported 
on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): run_test(more_args) -def test_lm_eval_accuracy_v0_engine(monkeypatch): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 85156d6931c8c..23fd72f4ebbb9 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -53,32 +53,37 @@ def cache_models(): @pytest.mark.skip_global_cleanup @pytest.mark.usefixtures("cache_models") -def test_offline_mode(monkeypatch): +def test_offline_mode(monkeypatch: pytest.MonkeyPatch): # Set HF to offline mode and ensure we can still construct an LLM - try: - monkeypatch.setenv("HF_HUB_OFFLINE", "1") - monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") - def disable_connect(*args, **kwargs): - raise RuntimeError("No http calls allowed") + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") - monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", - disable_connect) - monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", - disable_connect) + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) - # Need to re-import huggingface_hub and friends to setup offline mode - _re_import_modules() - # Cached model files should be used in offline mode - for model_config in MODEL_CONFIGS: - LLM(**model_config) - finally: - # Reset the environment after the test - # NB: Assuming tests are run in online mode - monkeypatch.delenv("HF_HUB_OFFLINE") - monkeypatch.delenv("VLLM_NO_USAGE_STATS") - _re_import_modules() - pass + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + # Cached model files should be used in offline mode + for model_config in MODEL_CONFIGS: + LLM(**model_config) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() def _re_import_modules(): diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index e4c087db3d4f0..d3948e2ed575e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -70,7 +70,7 @@ def run_test(more_args): @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): @pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, + more_args): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git 
a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 570e643e0364d..66db7509cc474 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,13 +5,12 @@ from unittest.mock import Mock, patch import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL @pytest.fixture(autouse=True) @@ -25,87 +24,111 @@ def clear_cache(): "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) -def test_env(name: str, use_v1: bool, device: str, monkeypatch): +def test_env( + name: str, + use_v1: bool, + device: str, + monkeypatch: pytest.MonkeyPatch, +): """Test that the attention selector can be set via environment variable. Note that we do not test FlashAttn because it is the default backend. """ - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, name) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) - if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - elif device == "openvino": - with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()), patch.dict('sys.modules', - {'openvino': Mock()}): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "OPENVINO" - else: - if name in ["XFORMERS", "FLASHINFER"]: + if device == "cpu": with patch("vllm.attention.selector.current_platform", - CudaPlatform()): + CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == "TORCH_SDPA" + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == EXPECTED + elif device == "openvino": + with patch("vllm.attention.selector.current_platform", + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == "OPENVINO" + else: + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float16, + torch.float16, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == EXPECTED -def 
test_flash_attn(monkeypatch): +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to # get_attn_backend - override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - # Unsupported CUDA arch - with patch("torch.cuda.get_device_capability", return_value=(7, 5)): + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # flash-attn is not installed - with patch.dict('sys.modules', {'vllm_flash_attn': None}): + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL @pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch): - """Ignore the invalid env variable if it is set.""" - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - with patch("vllm.attention.selector.current_platform", CudaPlatform()): + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 backend = 
get_attn_backend(32, torch.float16, None, 16, False) EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" assert backend.get_name() == EXPECTED diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py index 37ce00c74030a..248b294e546b3 100644 --- a/tests/kernels/test_awq.py +++ b/tests/kernels/test_awq.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest import torch @@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), reason="AWQ is not supported on this GPU type.") -def test_awq_dequantize_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) - zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) - split_k_iters = 0 - thx = 0 - thy = 0 - opcheck(torch.ops._C.awq_dequantize, - (qweight, scales, zeros, split_k_iters, thx, thy)) +def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) + zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) + split_k_iters = 0 + thx = 0 + thy = 0 + opcheck(torch.ops._C.awq_dequantize, + (qweight, scales, zeros, split_k_iters, thx, thy)) @pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), reason="AWQ is not supported on this GPU type.") -def test_awq_gemm_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.randint(-2000000000, - 2000000000, (64, 256), - device='cuda', - dtype=torch.int32) - qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) - split_k_iters = 8 - opcheck(torch.ops._C.awq_gemm, - (input, qweight, qzeros, scales, split_k_iters)) +def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.randint(-2000000000, + 2000000000, (64, 256), + device='cuda', + dtype=torch.int32) + qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) + split_k_iters = 8 + opcheck(torch.ops._C.awq_gemm, + (input, qweight, qzeros, scales, split_k_iters)) diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py index 7cd6082486605..724f0af283f70 100644 --- a/tests/kernels/test_rocm_attention_selector.py +++ b/tests/kernels/test_rocm_attention_selector.py @@ -1,13 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR @pytest.fixture(autouse=True) @@ -17,15 +15,19 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -def 
test_selector(monkeypatch): - """Test that the attention selector for ROCm. - """ - override_backend_env_variable(monkeypatch, "ROCM_FLASH") +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - with patch("vllm.attention.selector.current_platform", RocmPlatform()): + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert (backend.get_name() == "ROCM_FLASH" or backend.get_name() == "ROCM_ATTN_VLLM_V1") + # mla test for deepseek related backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 010974076ba8f..22b3d7c2be7a5 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = ( class QKVInputs(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, query/key/values and their sequence lengths. Attributes: - * {query,key,value}: unpacked (batch_size x padded_seq_len x + * {query,key,value}: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs * q_seq_lens: query sequence lengths list * kv_seq_lens: shared key/value sequence lengths list @@ -56,14 +56,14 @@ class QKVInputs(NamedTuple): class QKVO(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, alongside unpacked known-correct attention output Attributes: - * qkv: unpacked (batch_size x padded_seq_len x + * qkv: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs - * ideal_output: unpacked (batch_size x padded_seq_len x + * ideal_output: unpacked (batch_size x padded_seq_len x num_heads x head_size) known-correct attention output ''' @@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple): Attributes: - * {query,key,value}: packed (number_of_tokens x num_heads + * {query,key,value}: packed (number_of_tokens x num_heads x head_size) attention inputs * q_start_loc_list: list of query start locations within packed tensor * kv_start_loc_list: shared list of key/value start locations within @@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple): class PackedQKVO(NamedTuple): ''' - Data structure for representing packed attention inputs, + Data structure for representing packed attention inputs, alongside packed known-correct attention output Attributes: - * packed_qkv: packed (number_of_tokens x num_heads + * packed_qkv: packed (number_of_tokens x num_heads x head_size) attention inputs - * ideal_output: packed (number_of_tokens x num_heads + * ideal_output: packed (number_of_tokens x num_heads x head_size) known-correct attention output ''' @@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple): Attributes: - * packed_qkvo: packed (number_of_tokens x num_heads + * packed_qkvo: packed (number_of_tokens x num_heads x head_size) attention inputs & known-correct output * kv_mmap: KV cache memory mapping, specific to this test phase & @@ -195,7 +195,7 @@ def make_causal_mask( Create a q_max_seq_len x kv_max_seq_len causal mask Arguments: - + * q_max_seq_len: query max seq len * kv_max_seq_len: key/value max seq len @@ -320,9 +320,9 @@ def make_qkv( * max_kv_seq_len: max key/value seq len * num_heads 
* head_size - * is_encoder_decoder_attn: if True, query seqlen may differ from - key/value seqlen (as is often the case for cross-attention); - o/w, query/key/value seqlens match at each batch index + * is_encoder_decoder_attn: if True, query seqlen may differ from + key/value seqlen (as is often the case for cross-attention); + o/w, query/key/value seqlens match at each batch index (max_kv_seq_len is unused) * force_kv_seq_lens: if not None, overrides kv sequence lengths * attn_type: encoder, decoder self, or enc/dec cross attention @@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device, Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x num_heads x head_size tensors. - + For Q, number_of_tokens = sum(q_seq_lens). For K and V, number_of_tokens = sum(kv_seq_lens) @@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) - * for backend 'XFORMERS' + * for backend 'XFORMERS' * kv_cache: 2 x num_blocks x block_size x num_heads x head_size - * for backend 'FLASH_ATTN' + * for backend 'FLASH_ATTN' ''' if backend == 'XFORMERS': kv_cache = torch.rand( @@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], Context: * Your goal is to test (1) prefill of N prompts, with prompt-lengths {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token - for all N prompts (N tokens total); the resultant sequence lengths + for all N prompts (N tokens total); the resultant sequence lengths after decode would be {K_i + 1 for i \\in [0,N)} - * The test you want to do requires (1) having the prefill slot mapping - for all tokens present during prefill, the number of which is - M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N + * The test you want to do requires (1) having the prefill slot mapping + for all tokens present during prefill, the number of which is + M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N decoded tokens - - This function consumes a single 1D slot mapping, which is the + + This function consumes a single 1D slot mapping, which is the concatenation of N slot mappings each of length K_i + 1 (corresponding to the sequence lengths after decode), with a total length of P = \\sum_i{K_i + 1} = M + N The prefill-phase slot mapping results from excising the (K_i + 1)-th entry - from each of the N subsequences in the slot mapping (i.e. omitting the + from each of the N subsequences in the slot mapping (i.e. omitting the decoded token's mapping.) The N excised entries are appended to obtain the decode-phase slot mapping @@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N post-decode sequences - * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the + * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the description above) * device: cuda, cpu, etc. 
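    Example (illustrative, not taken from the test suite): with
    seq_lens = [3, 4] and slot_mapping_list = [0, 1, 2, 10, 11, 12, 13],
    the prefill slot mapping is [0, 1, 10, 11, 12] and the decode slot
    mapping is [2, 13].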
Returns: - * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) + * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) reflecting all N prefill prompts - * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting + * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting all N decoded tokens ''' @@ -725,7 +725,7 @@ def make_block_tables_slot_mapping( Then the minimum KV cache size in blocks is - total_cache_blocks = sum(num_blocks for all seqs) + total_cache_blocks = sum(num_blocks for all seqs) Then, the blocktable mapping counts downward from @@ -734,7 +734,7 @@ def make_block_tables_slot_mapping( to block_base_addr - + The constructed block-tables and slot-mapping are sized to the lengths of the sequences in their entirety (as reflected by seq_lens), @@ -749,7 +749,7 @@ def make_block_tables_slot_mapping( Return: - * block_tables_tensor: block table for sequence + * block_tables_tensor: block table for sequence * slot_mapping_list: slot mapping for sequence * max_block_idx: the highest block address within this block table ''' @@ -807,7 +807,7 @@ def make_test_metadata( encoder_test_params and cross_test_params arguments allow encoder attention and enc/dec cross-attention (respectively) to use distinct metadata values from decoder self-attention (decoder_test_params.) - + if encoder_test_params and cross_test_params are None, the attention metadata will support decoder-only scenario. @@ -820,7 +820,7 @@ def make_test_metadata( * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence - * decoder_test_params: decoder self-attention test params; + * decoder_test_params: decoder self-attention test params; this function requires kv_mmap (memory mapping) field * device: CPU or CUDA device diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/test_disagg.py similarity index 100% rename from tests/kv_transfer/disagg_test.py rename to tests/kv_transfer/test_disagg.py diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/test_module.py similarity index 100% rename from tests/kv_transfer/module_test.py rename to tests/kv_transfer/test_module.py diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index faca7a566e79c..51abcb7172cb7 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -12,11 +12,10 @@ import pytest from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ...utils import check_logprobs_close -os.environ["TOKENIZERS_PARALLELISM"] = "true" - @pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), @@ -55,45 +54,47 @@ def test_models( backend: str, tensor_parallel_size: int, disable_async_output_proc: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') + m.setenv(STR_BACKEND_ENV_VAR, backend) - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="fp16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) @pytest.mark.cpu_model @@ -119,38 +120,41 @@ def test_cpu_models( test_model: str, max_tokens: int, disable_async_output_proc: bool, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="bf16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="bf16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index cae3e1a5c6244..d6bf7d2706397 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import importlib.util import math @@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine import vllm import vllm.config +from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -29,36 +31,34 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch): +def test_find_array(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - from vllm.model_executor.models.gritlm import GritLMPooler + from vllm.model_executor.models.gritlm import GritLMPooler - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) + # Create an LLM object to get the model config. 
+ llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) @pytest.fixture(scope="module") def server_embedding(): # GritLM embedding implementation is only supported by XFormers backend. - with pytest.MonkeyPatch.context() as mp: - mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") - - args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server @pytest.fixture(scope="module") @@ -69,9 +69,12 @@ def server_generate(): @pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client +async def client_embedding(monkeypatch: pytest.MonkeyPatch, + server_embedding: RemoteOpenAIServer): + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + async with server_embedding.get_async_client() as async_client: + yield async_client @pytest_asyncio.fixture @@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer): yield async_client -def run_llm_encode(llm: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +def run_llm_encode( + llm: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = llm.encode([instruction + q for q in queries], ) return [output.outputs.embedding for output in outputs] -async def run_client_embeddings(client: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +async def run_client_embeddings( + client: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -106,7 +115,7 @@ def get_test_data(): README.md in https://github.com/ContextualAI/gritlm """ q_instruction = gritlm_instruction( - "Given a scientific paper title, retrieve the paper's abstract") + "Given a scientific paper title, retrieve the paper's abstract", ) queries = [ "Bitcoin: A Peer-to-Peer Electronic Cash System", "Generative Representational Instruction Tuning", @@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. 
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - queries, q_instruction, documents, d_instruction = get_test_data() + queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - d_rep = run_llm_encode( - llm, - documents, - d_instruction, - ) - q_rep = run_llm_encode( - llm, - queries, - q_instruction, - ) + d_rep = run_llm_encode( + llm, + documents, + d_instruction, + ) + q_rep = run_llm_encode( + llm, + queries, + q_instruction, + ) - validate_embed_output(q_rep, d_rep) + validate_embed_output(q_rep, d_rep) @pytest.mark.asyncio async def test_gritlm_api_server_embedding( - client_embedding: openai.AsyncOpenAI): + client_embedding: openai.AsyncOpenAI, ): queries, q_instruction, documents, d_instruction = get_test_data() d_rep = await run_client_embeddings( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index d3d07d0d9acfc..465c496f4c0f3 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest from vllm import LLM, SamplingParams @@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test @fork_new_process_for_each_test -def test_plugin(dummy_opt_path, monkeypatch): +def test_plugin( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): # V1 shuts down rather than raising an error here. - monkeypatch.setenv("VLLM_USE_V1", "0") - os.environ["VLLM_PLUGINS"] = "" - with pytest.raises(Exception) as excinfo: - LLM(model=dummy_opt_path, load_format="dummy") - error_msg = "has no vLLM implementation and " \ - "the Transformers implementation is not compatible with vLLM" - assert (error_msg in str(excinfo.value)) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + m.setenv("VLLM_PLUGINS", "") + + with pytest.raises(Exception) as excinfo: + LLM(model=dummy_opt_path, load_format="dummy") + error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501 + assert (error_msg in str(excinfo.value)) @fork_new_process_for_each_test -def test_oot_registration_text_generation(dummy_opt_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_opt_path, load_format="dummy") - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) +def test_oot_registration_text_generation( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_opt_path, load_format="dummy") + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" 
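# Illustrative sketch (not part of this patch): the core pattern these tests
# now follow. Setting an environment variable through a monkeypatch context
# restores the previous value automatically when the block exits, even if the
# test body raises. "SOME_ENV_VAR" is a hypothetical name.
import os

import pytest


def test_env_var_is_restored(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.delenv("SOME_ENV_VAR", raising=False)
    with monkeypatch.context() as m:
        m.setenv("SOME_ENV_VAR", "1")
        assert os.environ["SOME_ENV_VAR"] == "1"
    # Exiting the context undoes everything set on `m`.
    assert "SOME_ENV_VAR" not in os.environ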
@fork_new_process_for_each_test -def test_oot_registration_embedding(dummy_gemma2_embedding_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") - outputs = llm.embed(prompts) +def test_oot_registration_embedding( + monkeypatch: pytest.MonkeyPatch, + dummy_gemma2_embedding_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") + outputs = llm.embed(prompts) - for output in outputs: - assert all(v == 0 for v in output.outputs.embedding) + for output in outputs: + assert all(v == 0 for v in output.outputs.embedding) image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = [{ - "prompt": "What's in the image?", - "multi_modal_data": { - "image": image - }, - }, { - "prompt": "Describe the image", - "multi_modal_data": { - "image": image - }, - }] +def test_oot_registration_multimodal( + monkeypatch: pytest.MonkeyPatch, + dummy_llava_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = [{ + "prompt": "What's in the image?", + "multi_modal_data": { + "image": image + }, + }, { + "prompt": "Describe the image", + "multi_modal_data": { + "image": image + }, + }] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_llava_path, - load_format="dummy", - max_num_seqs=1, - trust_remote_code=True, - gpu_memory_utilization=0.98, - max_model_len=4096, - enforce_eager=True, - limit_mm_per_prompt={"image": 1}) - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_llava_path, + load_format="dummy", + max_num_seqs=1, + trust_remote_code=True, + gpu_memory_utilization=0.98, + max_model_len=4096, + enforce_eager=True, + limit_mm_per_prompt={"image": 1}) + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index aad7fc5303c13..e617bd057f1f4 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket): @pytest.mark.asyncio -async def test_mp_crash_detection(monkeypatch): +async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) + parser = FlexibleArgumentParser( + description="vLLM's remote OpenAI server.") + parser = make_arg_parser(parser) + args = parser.parse_args([]) - # When LLMEngine is loaded, it will crash. 
- def mock_init(): - raise ValueError + # When LLMEngine is loaded, it will crash. + def mock_init(): + raise ValueError - monkeypatch.setattr(LLMEngine, "__init__", mock_init) + m.setattr(LLMEngine, "__init__", mock_init) - start = time.perf_counter() - async with build_async_engine_client(args): - pass - end = time.perf_counter() + start = time.perf_counter() + async with build_async_engine_client(args): + pass + end = time.perf_counter() - assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " - "if there is an error in the startup.") + assert end - start < 60, ( + "Expected vLLM to gracefully shutdown in <60s " + "if there is an error in the startup.") @pytest.mark.asyncio diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index f925e42f46d37..ce716e6474cb4 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -5,7 +5,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close from ..utils import (completions_with_server_args, get_client_text_generations, @@ -52,7 +52,7 @@ async def test_multi_step( num_logprobs: Optional[int], attention_backend: str, enable_chunked_prefill: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling in an OpenAI-protocol client/server environment. @@ -82,67 +82,70 @@ async def test_multi_step( pytest.skip("Multi-step with Chunked-Prefill only supports" "PP=1 and FLASH_ATTN backend") - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] - if not is_async: - ms_server_args += ["--disable-async-output-proc"] + if not is_async: + ms_server_args += ["--disable-async-output-proc"] - if eager_mode: - ms_server_args.append("--enforce-eager") + if eager_mode: + ms_server_args.append("--enforce-eager") - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") + if enable_chunked_prefill: + ms_server_args.append("--enable-chunked-prefill") - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] + distributed_args = [ + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + ] - # Spin up client/server & issue completion API requests. 
- # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 5x to 1200 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts, + model, + server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) + test_completions = await completions_with_server_args( + prompts, + model, + ms_server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) + assert ref_generations == test_generations - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations(ref_completions) - test_text_logprobs = get_client_text_logprob_generations(test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) + # Assert multi-step scheduling produces nearly-identical logprobs + # to single-step scheduling. + ref_text_logprobs = get_client_text_logprob_generations( + ref_completions) + test_text_logprobs = get_client_text_logprob_generations( + test_completions) + check_logprobs_close( + outputs_0_lst=ref_text_logprobs, + outputs_1_lst=test_text_logprobs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize(("tp_size, pp_size"), [ @@ -152,7 +155,7 @@ async def test_multi_step( async def test_multi_step_pp_smoke( tp_size: int, pp_size: int, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Smoke test for the vLLM engine with multi-step scheduling in an @@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke( attention_backend = "FLASH_ATTN" max_num_seqs = 3 - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. 
- max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) + # Prompt from the ShareGPT dataset + prompts = [ + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + ] + # Use varying max_tokens to introduce scheduling randomness. + max_tokens = [10 * i for i in range(1, len(prompts) + 1)] + assert len(prompts) == len(max_tokens) - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] + test_args = [ + "--tensor-parallel-size", + str(tp_size), "--pipeline-parallel-size", + str(pp_size), "--max-num-seqs", + str(max_num_seqs) + ] - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args + server_args = DEFAULT_SERVER_ARGS + test_args + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ + test_args - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 3x to 720 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + test_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=ms_server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. 
+ ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 29d5ffd4c9cb1..a823e484beab6 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -7,7 +7,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal @@ -42,7 +42,7 @@ def test_multi_step_llm( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling via sync LLM Engine. @@ -70,48 +70,49 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=enable_chunked_prefill, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs)) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + hf_model.generate_greedy_logprobs_limit( + prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if num_logprobs is None: + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -136,7 +137,7 @@ def 
test_multi_step_llm_w_prompt_logprobs( num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test prompt logprobs with multi-step scheduling via sync LLM Engine. @@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs( note that this argument is not supported by the OpenAI completions endpoint. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + ) as vllm_model: + single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_logprobs_close( + outputs_0_lst=single_step_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. @@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( # # The Incorrect scheduling behavior - if it occurs - will cause an exception # in the model runner resulting from `do_sample=False`. 
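# Hedged sketch (not part of this patch) of how the repeated backend override
# could be factored into a shared helper; `backend_override` is a hypothetical
# name, while STR_BACKEND_ENV_VAR is the same constant imported from
# vllm.utils in the surrounding hunks.
# Usage: `with backend_override(monkeypatch, "FLASH_ATTN"): ...`
from contextlib import contextmanager

import pytest

from vllm.utils import STR_BACKEND_ENV_VAR


@contextmanager
def backend_override(monkeypatch: pytest.MonkeyPatch, backend: str):
    # Scope the attention-backend env var to the enclosed block only.
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, backend)
        yield m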
- override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n' - ) # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok + assert len(example_prompts) >= 2 + challenge_prompts = copy.deepcopy(example_prompts) + challenge_prompts[0] = ( + 'vLLM is a high-throughput and memory-efficient ' + 'inference and serving engine for LLMs.\n') # 24 tok + challenge_prompts[1] = ( + 'Briefly describe the major milestones in the ' + 'development of artificial intelligence from 1950 to 2020.\n' + ) # 30 tok - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts + # If necessary, adjust the length of `challenge_prompts` to match + # `num_prompts` + if len(challenge_prompts) < num_prompts: + challenge_prompts = (challenge_prompts * + ((num_prompts // len(challenge_prompts)) + 1)) + challenge_prompts = challenge_prompts[:num_prompts] + assert len(challenge_prompts) == num_prompts - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # Single-step scheduler baseline + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as vllm_model: + outputs_baseline = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # multi-step+"single-step chunked prefill"+APC + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=True, + enable_prefix_caching=True, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as 
vllm_model: + outputs_w_features = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) + if num_logprobs is None: + # No-logprobs test + check_outputs_equal( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) + else: + # Yes-logprobs test + check_logprobs_close( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py index 30dcdd573edf3..033a36b4156b0 100644 --- a/tests/neuron/1_core/test_block_table.py +++ b/tests/neuron/1_core/test_block_table.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import os import neuronxcc.nki.language as nl import pytest @@ -99,6 +98,7 @@ def ref_block_tables_transform( ) @torch.inference_mode() def test_load_and_transform_block_tables( + monkeypatch: pytest.MonkeyPatch, num_tiles, num_blocks_per_tile, q_head_per_kv_head, @@ -108,46 +108,46 @@ def test_load_and_transform_block_tables( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(10000) - torch.set_printoptions(sci_mode=False) + torch.manual_seed(10000) + torch.set_printoptions(sci_mode=False) - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - B_P_SIZE = 128 - if num_blocks_per_tile < B_P_SIZE: - assert B_P_SIZE % num_blocks_per_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile - else: - block_size_tiling_factor = 1 - max_num_blocks = 100000 - block_tables = torch.randint( - 0, - max_num_blocks, - (num_tiles * num_blocks_per_tile, ), - dtype=torch.int32, - ) - nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( - block_tables.to(device=device), - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ).cpu() - ref_out = ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ) - assert (nki_out.shape == ref_out.shape - ), f"{nki_out.shape=} != {ref_out.shape=}" - assert torch.all(nki_out == ref_out) + # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient + B_P_SIZE = 128 + if num_blocks_per_tile < B_P_SIZE: + assert B_P_SIZE % num_blocks_per_tile == 0 + block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile + else: + block_size_tiling_factor = 1 + max_num_blocks = 100000 + block_tables = torch.randint( + 0, + max_num_blocks, + (num_tiles * num_blocks_per_tile, ), + dtype=torch.int32, + ) + nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( + block_tables.to(device=device), + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ).cpu() 
+ ref_out = ref_block_tables_transform( + block_tables, + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ) + assert (nki_out.shape == ref_out.shape + ), f"{nki_out.shape=} != {ref_out.shape=}" + assert torch.all(nki_out == ref_out) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 326a1f82e9b30..37d6679f8d55b 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, ]) @torch.inference_mode() def test_contexted_kv_attention( + monkeypatch: pytest.MonkeyPatch, prefill_batch_size: int, decode_batch_size: int, num_heads: int, @@ -329,7 +330,6 @@ def test_contexted_kv_attention( large_tile_size, mixed_precision: bool, ) -> None: - import os import torch_xla.core.xla_model as xm @@ -340,174 +340,178 @@ def test_contexted_kv_attention( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(0) - torch.set_printoptions(sci_mode=False) - torch.set_default_device("cpu") - dtype = torch.float32 + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + torch.set_default_device("cpu") + dtype = torch.float32 - min_ctx_len = 32 - max_ctx_len = 1024 - min_query_len = 16 - max_query_len = 512 - num_kv_heads = num_heads // num_queries_per_kv - ( - query, - k_active, - v_active, - k_cache, - v_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) = sample_inputs( - prefill_batch_size=prefill_batch_size, - decode_batch_size=decode_batch_size, - min_query_len=min_query_len, - max_query_len=max_query_len, - min_ctx_len=min_ctx_len, - max_ctx_len=max_ctx_len, - block_size=block_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_size=head_size, - dtype=dtype, - ) + min_ctx_len = 32 + max_ctx_len = 1024 + min_query_len = 16 + max_query_len = 512 + num_kv_heads = num_heads // num_queries_per_kv + ( + query, + k_active, + v_active, + k_cache, + v_cache, + block_table, + key, + value, + query_lens, + seq_lens, + ) = sample_inputs( + prefill_batch_size=prefill_batch_size, + decode_batch_size=decode_batch_size, + min_query_len=min_query_len, + max_query_len=max_query_len, + min_ctx_len=min_ctx_len, + max_ctx_len=max_ctx_len, + block_size=block_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + ) - output_ref = ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, - ) + output_ref = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_queries_per_kv, + return_max_reduce=False, + ) - # build neuron program - B_P_SIZE = 128 - assert (large_tile_size >= B_P_SIZE - ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" + # build neuron program + B_P_SIZE = 128 + assert (large_tile_size >= B_P_SIZE + ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - def ceil_div(a, b): - return (a + b - 1) // b + def ceil_div(a, b): + return (a + b - 1) // b - def pad_to_multiple(a, b): - return ceil_div(a, b) * b + def pad_to_multiple(a, b): + return ceil_div(a, b) * b - def pad_to_next_power_of_2(a): 
- assert a > 0 - return 2**int(a - 1).bit_length() + def pad_to_next_power_of_2(a): + assert a > 0 + return 2**int(a - 1).bit_length() - # calculate input shapes - max_num_queries = pad_to_next_power_of_2(sum(query_lens)) - context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = ceil_div(context_lens, block_size).sum().item() - num_active_blocks = pad_to_multiple(num_active_blocks, - large_tile_size // block_size) - context_kv_len = num_active_blocks * block_size - assert (context_kv_len % + # calculate input shapes + max_num_queries = pad_to_next_power_of_2(sum(query_lens)) + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = pad_to_multiple(num_active_blocks, + large_tile_size // block_size) + context_kv_len = num_active_blocks * block_size + assert ( + context_kv_len % large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" - # pad QKV tensors - pad_dims = ( - 0, - 0, - 0, - 0, - 0, - max_num_queries - query.shape[0], - ) - query = F.pad(query, pad_dims, "constant", 0) - k = F.pad(k_active, pad_dims, "constant", 0) - v = F.pad(v_active, pad_dims, "constant", 0) - - # permute QKV tensors - # query: (1, n_heads, d, seq_q) - # key: (1, n_kv_heads, d, seq_k) - # value: (1, n_kv_heads, seq_v, d) - query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() - k_cache = k_cache.permute(0, 2, 1, 3).contiguous() - v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - - # transform block table - active_block_table = get_active_block_tables( - block_table.cpu(), - torch.tensor(query_lens).cpu(), - torch.tensor(seq_lens).cpu(), - block_size, - num_active_blocks, - ) - - # Build attention masks - prior_mask, active_mask = ( - BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens, block_size=block_size)) - prior_mask_padded = F.pad( - prior_mask, - ( + # pad QKV tensors + pad_dims = ( 0, - context_kv_len - prior_mask.shape[1], 0, - max_num_queries - prior_mask.shape[0], - ), - "constant", - 0, - ).bool() - active_mask_padded = F.pad( - active_mask, - ( 0, - max_num_queries - active_mask.shape[1], 0, - max_num_queries - active_mask.shape[0], - ), - "constant", - 0, - ).bool() - attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) + 0, + max_num_queries - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k_active, pad_dims, "constant", 0) + v = F.pad(v_active, pad_dims, "constant", 0) - attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + k_cache = k_cache.permute(0, 2, 1, 3).contiguous() + v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - input_args = ( - query.to(device=device), - k.to(device=device), - v.to(device=device), - k_cache.to(device=device), - v_cache.to(device=device), - active_block_table.to(device=device), - attn_mask.to(device=device), - ) - input_kwargs = dict( - n_kv_head=num_kv_heads, - head_size=head_size, - mixed_precision=mixed_precision, - LARGE_TILE_SZ=large_tile_size, - ) + # transform block table + active_block_table = 
get_active_block_tables( + block_table.cpu(), + torch.tensor(query_lens).cpu(), + torch.tensor(seq_lens).cpu(), + block_size, + num_active_blocks, + ) - output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + prior_mask_padded = F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + max_num_queries - prior_mask.shape[0], + ), + "constant", + 0, + ).bool() + active_mask_padded = F.pad( + active_mask, + ( + 0, + max_num_queries - active_mask.shape[1], + 0, + max_num_queries - active_mask.shape[0], + ), + "constant", + 0, + ).bool() + attn_mask = torch.concat([prior_mask_padded, active_mask_padded], + dim=1) - num_actual_tokens = sum(query_lens) - # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.cpu().permute(0, 2, 1, 3) - output_nki = output_nki[0, :num_actual_tokens, :, :] - output_ref_padded = F.pad( - output_ref, - (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), - "constant", - 0, - ) - output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + attn_mask = reorder_context_mask(attn_mask, large_tile_size, + block_size) - torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + LARGE_TILE_SZ=large_tile_size, + ) + + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + + num_actual_tokens = sum(query_lens) + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.cpu().permute(0, 2, 1, 3) + output_nki = output_nki[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose( + 0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 3be248f5aca45..9d6872e0e0772 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import get_attn_backend -from vllm.utils import STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL def test_platform_plugins(): @@ -25,8 +25,9 @@ def test_platform_plugins(): f" is loaded. 
The first import:\n{_init_trace}") -def test_oot_attention_backend(monkeypatch): +def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): # ignore the backend env variable if it is set - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == "Dummy_Backend" + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "Dummy_Backend" diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 98981a81e909c..7abf5066a4133 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: +def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV0Scheduler, + ) - engine = LLMEngine.from_engine_args(engine_args=engine_args) + engine = LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV0Scheduler" -def test_scheduler_plugins_v1(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - # Explicitly turn off engine multiprocessing so that the scheduler runs in - # this process - monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") +def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + # Explicitly turn off engine multiprocessing so + # that the scheduler runs in this process + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - with pytest.raises(Exception) as exception_info: + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV1Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV1Scheduler, + ) - engine = V1LLMEngine.from_engine_args(engine_args=engine_args) + engine = V1LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV1Scheduler" diff --git a/tests/prefix_caching/test_prefix_caching.py 
b/tests/prefix_caching/test_prefix_caching.py
index 7a4bc7aecc0f4..607b6c43e02e2 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -4,25 +4,29 @@
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
+from __future__ import annotations
+
 import pytest
 
 from tests.conftest import VllmRunner
 from tests.core.utils import SchedulerProxy, create_dummy_prompt
-from tests.kernels.utils import override_backend_env_variable
 from vllm import SamplingParams, TokensPrompt
 from vllm.core.scheduler import Scheduler
 from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ..models.utils import check_outputs_equal
 
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
     """
     This module relies on V0 internals, so set VLLM_USE_V1=0.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
 
 
 MODELS = [
@@ -56,7 +60,7 @@ def test_mixed_requests(
     cached_position: int,
     enable_chunked_prefill: bool,
     block_size: int,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Test the case when some sequences have the prefix cache hit
@@ -67,72 +71,77 @@ def test_mixed_requests(
         pytest.skip("Flashinfer does not support ROCm/HIP.")
     if backend == "XFORMERS" and current_platform.is_rocm():
         pytest.skip("Xformers does not support ROCm/HIP.")
-    override_backend_env_variable(monkeypatch, backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    cached_prompt = example_prompts[cached_position]
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enable_prefix_caching=True,
-            enable_chunked_prefill=enable_chunked_prefill,
-            block_size=block_size,
-    ) as vllm_model:
-        # Run the first prompt so the cache is populated
-        vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
+        cached_prompt = example_prompts[cached_position]
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enable_prefix_caching=True,
+                enable_chunked_prefill=enable_chunked_prefill,
+                block_size=block_size,
+        ) as vllm_model:
+            # Run the first prompt so the cache is populated
+            vllm_outputs = vllm_model.generate_greedy([cached_prompt],
+                                                      max_tokens)
 
-        # Run all the promopts
-        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
+            # Run all the prompts
+            greedy_params = SamplingParams(temperature=0.0,
+                                           max_tokens=max_tokens)
+            req_outputs = vllm_model.model.generate(example_prompts,
+                                                    greedy_params)
 
-        # Verify number of cached tokens
-        for i in range(len(req_outputs)):
-            if i == cached_position:
-                expected_num_cached_tokens = (
-                    len(req_outputs[i].prompt_token_ids) //
-                    block_size) * block_size
-            else:
-                expected_num_cached_tokens = 0
-            assert (
-                req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
+            # Verify number of cached tokens
+            for i in range(len(req_outputs)):
+                if i == cached_position:
+                    expected_num_cached_tokens = (
+                        len(req_outputs[i].prompt_token_ids) //
+                        block_size) * block_size
+                else:
+                    expected_num_cached_tokens = 0
+                assert (req_outputs[i].num_cached_tokens ==
+                        
expected_num_cached_tokens) - vllm_outputs = [( - output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text, - ) for output in req_outputs] + vllm_outputs = [( + output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text, + ) for output in req_outputs] - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) def test_unstable_prompt_sequence( vllm_runner, backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") if backend == "XFORMERS" and current_platform.is_rocm(): pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, backend) - with vllm_runner( - "Qwen/Qwen2.5-0.5B-Instruct", - enable_chunked_prefill=True, - enable_prefix_caching=True, - max_model_len=4096, - ) as vllm_model: - for prompt in UNSTABLE_PROMPT_SEQUENCE: - vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), - SamplingParams(max_tokens=1)) + with vllm_runner( + "Qwen/Qwen2.5-0.5B-Instruct", + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_model_len=4096, + ) as vllm_model: + for prompt in UNSTABLE_PROMPT_SEQUENCE: + vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), + SamplingParams(max_tokens=1)) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/test_regression.py b/tests/test_regression.py index b54dc6af3e9a6..8c9d4a91c73be 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -56,12 +56,11 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 -def test_model_from_modelscope(monkeypatch): +def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary - MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" - monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") - try: - llm = LLM(model=MODELSCOPE_MODEL_NAME) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") prompts = [ "Hello, my name is", @@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch): outputs = llm.generate(prompts, sampling_params) assert len(outputs) == 4 - finally: - monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/test_utils.py b/tests/test_utils.py index dcca7d5965e9e..ae4fddd046d45 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa import asyncio -import os import socket from collections.abc import AsyncIterator from unittest.mock import patch @@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message(): dummy(old_arg=1) -def test_get_open_port(): - os.environ["VLLM_PORT"] = "5678" - # make sure we can get multiple ports, even if the env var is set - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: - s1.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: - s2.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) as s3: - s3.bind(("localhost", get_open_port())) - os.environ.pop("VLLM_PORT") +def test_get_open_port(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_PORT", "5678") + # make sure we can get multiple ports, even if the env var is set + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: + s1.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: + s2.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: + s3.bind(("localhost", get_open_port())) # Tests for FlexibleArgumentParser @@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(monkeypatch): +def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. - monkeypatch.setenv("VLLM_USE_V1", "0") + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") - from vllm.attention import Attention, AttentionType + from vllm.attention import Attention, AttentionType - # example from bart - ctx = { - 'encoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), - 'decoder.layers.0.encoder_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), - 'decoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), - } + # example from bart + ctx = { + 'encoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), + 'decoder.layers.0.encoder_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), + 'decoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), + } - kv_cache = [ - torch.zeros((1, )), - ] - encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache + kv_cache = [ + torch.zeros((1, )), + ] + encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache - bind_kv_cache(ctx, [kv_cache]) - assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache - assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] - assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache + assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] + assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] def test_bind_kv_cache_pp(): diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index e94bbd2877225..f7a59f054b61b 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os +import pytest from vllm.config import CompilationLevel @@ -9,16 +9,17 @@ from ..utils import compare_two_settings # --enforce-eager on TPU causes graph compilation # this times out default Health Check in the MQLLMEngine, # so we set the timeout here to 30s -os.environ["VLLM_RPC_TIMEOUT"] = "30000" -def test_custom_dispatcher(): - compare_two_settings( - "google/gemma-2b", - arg1=[ - "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", - ], - arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], - env1={}, - env2={}) +def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + 
m.setenv("VLLM_RPC_TIMEOUT", "30000") + compare_two_settings( + "google/gemma-2b", + arg1=[ + "--enforce-eager", + f"-O{CompilationLevel.DYNAMO_ONCE}", + ], + arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], + env1={}, + env2={}) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 97149884497af..a781b8b563be1 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa +# type: ignore +from __future__ import annotations -import os import threading from collections.abc import Iterable from concurrent import futures -from typing import Callable, Literal +from typing import Callable, Generator, Literal import grpc import pytest @@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" @@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer): @pytest.fixture -def trace_service(): +def trace_service() -> Generator[FakeTraceService, None, None]: """Fixture to set up a fake gRPC trace service""" server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) service = FakeTraceService() @@ -80,136 +84,153 @@ def trace_service(): server.stop(None) -def test_traces(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got 
{len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + # Model forward and model execute should be none, since detailed traces is + # not enabled. 
+ assert metrics.model_forward_time is None + assert metrics.model_execute_time is None -def test_traces_with_detailed_steps(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces_with_detailed_steps( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces="all", - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + collect_detailed_traces="all", + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - 
metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( - metrics.model_forward_time / 1000) - assert metrics.model_execute_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + assert metrics.model_forward_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD + ) == pytest.approx(metrics.model_forward_time / 1000) + assert metrics.model_execute_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE + ) == metrics.model_execute_time + assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/utils.py b/tests/utils.py index fc19c8d031b16..06ba8a2421c16 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -566,6 +566,7 @@ def init_test_distributed_environment( def multi_process_parallel( + monkeypatch: pytest.MonkeyPatch, tp_size: int, pp_size: int, test_target: Any, @@ -582,7 +583,13 @@ def multi_process_parallel( refs = [] for rank in range(tp_size * pp_size): refs.append( - test_target.remote(tp_size, pp_size, rank, distributed_init_port)) + test_target.remote( + monkeypatch, + tp_size, + pp_size, + rank, + distributed_init_port, + ), ) ray.get(refs) ray.shutdown() @@ -700,7 +707,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: """ Get a pytest mark, which skips the test if the GPU doesn't meet a minimum memory requirement in GB. - + This can be leveraged via `@large_gpu_test` to skip tests in environments without enough resources, or called when filtering tests to run directly. 
""" diff --git a/tests/v1/e2e/test_ngram_spec_decode.py b/tests/v1/e2e/test_ngram_spec_decode.py index 519a74cab84bc..6cca324514565 100644 --- a/tests/v1/e2e/test_ngram_spec_decode.py +++ b/tests/v1/e2e/test_ngram_spec_decode.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random +from typing import Any import pytest @@ -50,8 +53,12 @@ def model_name(): return "meta-llama/Meta-Llama-3-8B-Instruct" -def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, - model_name): +def test_ngram_correctness( + monkeypatch: pytest.MonkeyPatch, + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, +): ''' Compare the outputs of a original LLM and a speculative LLM should be the same when using ngram speculative decoding. diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5b9725d59ddc5..0ff804976ada6 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_load(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: tuple[AsyncEngineArgs, - PromptType]): +async def test_load( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType], +): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. @@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_abort(monkeypatch, output_kind: RequestOutputKind, +async def test_abort(monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5fdbcf5b99636..2ec4f7e034af8 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest: @fork_new_process_for_each_test -def test_engine_core(monkeypatch): +def test_engine_core(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -159,10 +159,10 @@ def test_engine_core(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_advanced_sampling(monkeypatch): +def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): """ - A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as top_p, min_tokens, and + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_concurrent_batches(monkeypatch): +def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): """ Test that the engine can handle multiple concurrent batches. 
""" diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e646ccbd46030..004b4dc82f4d9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str: @fork_new_process_for_each_test @pytest.mark.parametrize("multiprocessing_mode", [True, False]) -def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): +def test_engine_core_client(monkeypatch: pytest.MonkeyPatch, + multiprocessing_mode: bool): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): @pytest.mark.asyncio(loop_scope="function") -async def test_engine_core_client_asyncio(monkeypatch): +async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index e763aa2c86998..3800cb392fbad 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -57,7 +57,7 @@ def _repeat_logprob_config( logprob_prompt_logprob_list: BatchLogprobsSpecType, ) -> BatchLogprobsSpecType: """Ensure each test prompt has a logprob config. - + A logprob config specifies the optional (i.e. may-be-`None`) number of sample logprobs and the optional number of prompt logprobs. @@ -80,7 +80,7 @@ def _repeat_logprob_config( (optional num sample logprob, optional num prompt logprob) tuples - + Returns: list of (optional num sample logprob,optional num prompt logprob) @@ -255,14 +255,12 @@ def _run_and_validate( [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) @pytest.mark.parametrize("temperature", [0.0, 2.0]) def test_get_logprobs_and_prompt_logprobs( - hf_model, - vllm_model, - batch_logprobs_composition: BatchLogprobsComposition, - temperature: float, - example_prompts, -) -> None: + hf_model, vllm_model, + batch_logprobs_composition: BatchLogprobsComposition, + temperature: float, example_prompts: list[str], + monkeypatch: pytest.MonkeyPatch) -> None: """Test V1 Engine logprobs & prompt logprobs - + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` settings and validate that * The generated logprobs and prompt logprobs are consistent with the @@ -279,7 +277,7 @@ def test_get_logprobs_and_prompt_logprobs( To save time, only test one APC-enabled scenario (sample & prompt logprobs enabled, temperature>0.0). - + Args: hf_model: HuggingFace reference model fixture vllm_model: vLLM model fixture @@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching - if do_apc and (temperature < 2.0 - or batch_logprobs_composition != SAMPLE_PROMPT): - # Skip some test-cases to save time. - pytest.skip() - test_prompts = example_prompts + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + if do_apc and (temperature < 2.0 + or batch_logprobs_composition != SAMPLE_PROMPT): + # Skip some test-cases to save time. 
+ pytest.skip() + test_prompts = example_prompts - max_tokens = 5 - hf_outputs = hf_model.generate_greedy( - test_prompts, - max_tokens=max_tokens, - ) - hf_logprobs = hf_model.generate_greedy_logprobs( - test_prompts, - max_tokens=max_tokens, - ) - - # Batch has mixed sample params - # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) - - # Ensure that each test prompt has a logprob config for testing - logprob_prompt_logprob_list = _repeat_logprob_config( - test_prompts, logprob_prompt_logprob_list) - # Generate SamplingParams - vllm_sampling_params = [ - SamplingParams(max_tokens=max_tokens, - logprobs=num_lp, - prompt_logprobs=num_plp, - temperature=temperature, - seed=1984) - for num_lp, num_plp in logprob_prompt_logprob_list - ] - for _ in range(2 if do_apc else 1): - _run_and_validate( - vllm_model=vllm_model, - test_prompts=test_prompts, - vllm_sampling_params=vllm_sampling_params, - hf_logprobs=hf_logprobs, - hf_outputs=hf_outputs, - logprob_prompt_logprob_list=logprob_prompt_logprob_list, - temperature=temperature, + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, max_tokens=max_tokens, - do_apc=do_apc) + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch( + batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984) + for num_lp, num_plp in logprob_prompt_logprob_list + ] + for _ in range(2 if do_apc else 1): + _run_and_validate( + vllm_model=vllm_model, + test_prompts=test_prompts, + vllm_sampling_params=vllm_sampling_params, + hf_logprobs=hf_logprobs, + hf_outputs=hf_outputs, + logprob_prompt_logprob_list=logprob_prompt_logprob_list, + temperature=temperature, + max_tokens=max_tokens, + do_apc=do_apc) -def test_max_logprobs(): +def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` - Should also fail for `prompt_logprobs > max_logprobs` - APC should not matter as this test checks basic request validation. 
- - Args: - monkeypatch """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") - runner = VllmRunner("facebook/opt-125m", - max_logprobs=1, - enable_prefix_caching=False, - max_model_len=256) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + runner = VllmRunner("facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + max_model_len=256) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], + sampling_params=bad_sampling_params) -def test_none_logprobs(vllm_model, example_prompts): +def test_none_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0) - results_logprobs_none = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + ) + results_logprobs_none = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params_logprobs_none, + ) - for i in range(len(results_logprobs_none)): - # Check sample logprobs are None - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None - # Check prompt logprobs are None - assert results_logprobs_none[i].prompt_logprobs is None + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[ + 0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None -def test_zero_logprobs(vllm_model, example_prompts): +def test_zero_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return sampled token and prompt token logprobs - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, - logprobs=0, - prompt_logprobs=0, - temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_zero) + sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, + logprobs=0, + prompt_logprobs=0, + temperature=0.0) + results_logprobs_zero = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero) - for i in range(len(results_logprobs_zero)): - # Check that there is one sample logprob dict for each - # sample token - logprobs = results_logprobs_zero[i].outputs[0].logprobs - 
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs - sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids - prompt_token_ids = results_logprobs_zero[i].prompt_token_ids - assert logprobs is not None - assert len(sampled_token_ids) == len(logprobs) - assert results_logprobs_zero[i].outputs[ - 0].cumulative_logprob is not None - # Check that there is one prompt logprob dict for each - # prompt token - assert prompt_logprobs is not None - assert len(prompt_token_ids) == len(prompt_logprobs) + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[ + 0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 0309f545ea49e..241f49e4faea8 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -3,11 +3,16 @@ Run `pytest tests/v1/tpu/test_basic.py`. """ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest from vllm.platforms import current_platform -from ...conftest import VllmRunner +if TYPE_CHECKING: + from tests.conftest import VllmRunner MODELS = [ # "Qwen/Qwen2-7B-Instruct", @@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1] @pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) def test_models( - monkeypatch, + vllm_runner: type[VllmRunner], + monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, enforce_eager: bool, @@ -41,7 +47,7 @@ def test_models( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - with VllmRunner( + with vllm_runner( model, max_model_len=8192, enforce_eager=enforce_eager, @@ -50,5 +56,5 @@ def test_models( tensor_parallel_size=tensor_parallel_size) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - assert "1024" in output + output = vllm_outputs[0][1] + assert "1024" in output