Merge branch 'main' into mamba_tests

Tyler Michael Smith 2025-03-17 13:49:56 +00:00
commit ac08d45200
209 changed files with 17538 additions and 4588 deletions


@@ -198,6 +198,7 @@ steps:
   commands:
     # split the test to avoid interference
     - pytest -v -s v1/core
+    - pytest -v -s v1/entrypoints
     - pytest -v -s v1/engine
     - pytest -v -s v1/sample
     - pytest -v -s v1/worker
@@ -225,10 +226,13 @@ steps:
     - python3 offline_inference/basic/chat.py
     - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
-    - python3 offline_inference/vision_language.py
-    - python3 offline_inference/vision_language_multi_image.py
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_embedding.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
     - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder.py
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     - python3 offline_inference/basic/classify.py
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
@@ -530,7 +534,7 @@ steps:
     # TODO: investigate and fix
     # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
     - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+    - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"


@@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
----
-We're excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
-Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
-👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
----
 *Latest News* 🔥
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).


@@ -82,10 +82,10 @@ Then run the benchmarking script
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 NUM_PROMPTS=10
-BACKEND="openai-chat"
+BACKEND="vllm"
 DATASET_NAME="sharegpt"
 DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
 ```
 If successful, you will see the following output


@@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
-from vllm.model_executor.model_loader.weight_utils import get_lock
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
+        from vllm.model_executor.model_loader.weight_utils import get_lock
         # Use file lock to prevent multiple processes from
         # downloading the same model weights at the same time.
         with get_lock(pretrained_model_name_or_path):
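The note above states the motivation: the benchmark request helpers must stay importable without vLLM, so the vLLM-specific import is deferred to the single code path that needs it. A minimal sketch of that lazy-import pattern, assuming the same environment variable; the helper name is a placeholder, not from the diff:

```python
import os


def resolve_model_path(repo_id: str) -> str:
    """Return a local path for `repo_id`, importing optional deps lazily."""
    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
        # Imported only when this branch runs, so the script still works
        # in environments where modelscope / vLLM are not installed.
        from modelscope import snapshot_download

        from vllm.model_executor.model_loader.weight_utils import get_lock

        # File lock so concurrent benchmark processes do not race on the
        # same download directory.
        with get_lock(repo_id):
            return snapshot_download(model_id=repo_id)
    return repo_id
```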


@@ -684,6 +684,15 @@ def main(args: argparse.Namespace):
                         "Invalid metadata format. Please use KEY=VALUE format."
                     )
+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                    "input_lens", "output_lens", "ttfts", "itls",
+                    "generated_texts", "errors"
+            ]:
+                if field in result_json:
+                    del result_json[field]
         # Traffic
         result_json["request_rate"] = (args.request_rate if args.request_rate
                                        < float("inf") else "inf")
@@ -828,6 +837,12 @@ if __name__ == "__main__":
         action="store_true",
         help="Specify to save benchmark results to a json file",
     )
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per request "
+        "information such as response, error, ttfs, tpots, etc.",
+    )
     parser.add_argument(
         "--metadata",
         metavar="KEY=VALUE",


@@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
 template <typename T>
 __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
-  union tmpcvt {
+  [[maybe_unused]] union tmpcvt {
     uint16_t u;
     _Float16 f;
     __hip_bfloat16 b;
@@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
 template <typename T>
 __device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
                                         const _B16x4& inp2) {
-  union tmpcvt {
+  [[maybe_unused]] union tmpcvt {
     uint16_t u;
     _Float16 f;
     __hip_bfloat16 b;
@@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
   const int seq_idx = blockIdx.y;
   const int context_len = context_lens[seq_idx];
   const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   const int warpid = threadIdx.x / WARP_SIZE;
-  const int laneid = threadIdx.x % WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
   __shared__ float shared_global_exp_sum;
   // max num partitions supported is warp_size * NPAR_LOOPS


@@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou
 For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache`.
 As long as the `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
 [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
 The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
 :::


@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
import os import os
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -23,21 +25,31 @@ question_per_audio_count = {
2: "What sport and what nursery rhyme are referenced?" 2: "What sport and what nursery rhyme are referenced?"
} }
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs. # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# MiniCPM-O # MiniCPM-O
def run_minicpmo(question: str, audio_count: int): def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model_name = "openbmb/MiniCPM-o-2_6" model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
llm = LLM(model=model_name, engine_args = EngineArgs(
trust_remote_code=True, model=model_name,
max_model_len=4096, trust_remote_code=True,
max_num_seqs=5, max_model_len=4096,
limit_mm_per_prompt={"audio": audio_count}) max_num_seqs=5,
limit_mm_per_prompt={"audio": audio_count},
)
stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
tokenize=False, tokenize=False,
add_generation_prompt=True, add_generation_prompt=True,
chat_template=audio_chat_template) chat_template=audio_chat_template)
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=stop_token_ids,
)
# Phi-4-multimodal-instruct # Phi-4-multimodal-instruct
def run_phi4mm(questions: str, audio_count: int): def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
""" """
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs. show how to process audio inputs.
@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int):
speech_lora_path = os.path.join(model_path, "speech-lora") speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)]) placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>" prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
llm = LLM( engine_args = EngineArgs(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int):
lora_extra_vocab_size=0, lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count}, limit_mm_per_prompt={"audio": audio_count},
) )
lora_request = LoRARequest("speech", 1, speech_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
stop_token_ids = None return ModelRequestData(
return llm, prompts, stop_token_ids engine_args=engine_args,
prompt=prompts,
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
)
# Qwen2-Audio # Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int): def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct" model_name = "Qwen/Qwen2-Audio-7B-Instruct"
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=4096, model=model_name,
max_num_seqs=5, max_model_len=4096,
limit_mm_per_prompt={"audio": audio_count}) max_num_seqs=5,
limit_mm_per_prompt={"audio": audio_count},
)
audio_in_prompt = "".join([ audio_in_prompt = "".join([
f"Audio {idx+1}: " f"Audio {idx+1}: "
@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int):
"<|im_start|>user\n" "<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n" f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = None
return llm, prompt, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# Ultravox 0.5-1B # Ultravox 0.5-1B
def run_ultravox(question: str, audio_count: int): def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int):
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=4096, model=model_name,
max_num_seqs=5, max_model_len=4096,
trust_remote_code=True, max_num_seqs=5,
limit_mm_per_prompt={"audio": audio_count}) trust_remote_code=True,
stop_token_ids = None limit_mm_per_prompt={"audio": audio_count},
return llm, prompt, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
# Whisper # Whisper
def run_whisper(question: str, audio_count: int): def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, ( assert audio_count == 1, (
"Whisper only support single audio input per prompt") "Whisper only support single audio input per prompt")
model_name = "openai/whisper-large-v3-turbo" model_name = "openai/whisper-large-v3-turbo"
prompt = "<|startoftranscript|>" prompt = "<|startoftranscript|>"
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=448, model=model_name,
max_num_seqs=5, max_model_len=448,
limit_mm_per_prompt={"audio": audio_count}) max_num_seqs=5,
stop_token_ids = None limit_mm_per_prompt={"audio": audio_count},
return llm, prompt, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
)
model_example_map = { model_example_map = {
@ -164,14 +194,24 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
audio_count = args.num_audios audio_count = args.num_audios
llm, prompt, stop_token_ids = model_example_map[model]( req_data = model_example_map[model](question_per_audio_count[audio_count],
question_per_audio_count[audio_count], audio_count) audio_count)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference. # even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2,
max_tokens=64, max_tokens=64,
stop_token_ids=stop_token_ids) stop_token_ids=req_data.stop_token_ids)
mm_data = {} mm_data = {}
if audio_count > 0: if audio_count > 0:
@ -183,7 +223,7 @@ def main(args):
} }
assert args.num_prompts > 0 assert args.num_prompts > 0
inputs = {"prompt": prompt, "multi_modal_data": mm_data} inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1: if args.num_prompts > 1:
# Batch inference # Batch inference
inputs = [inputs] * args.num_prompts inputs = [inputs] * args.num_prompts
@ -214,6 +254,10 @@ if __name__ == "__main__":
default=1, default=1,
choices=[0, 1, 2], choices=[0, 1, 2],
help="Number of audio items per prompt.") help="Number of audio items per prompt.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
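The refactor running through these example scripts is the same in each case: the per-model `run_*` helpers now return a `ModelRequestData` holding an `EngineArgs` (plus prompts, optional stop token IDs, and optional LoRA requests) instead of a fully constructed `LLM`, and `main()` builds the engine once, overriding the seed from the CLI. A minimal sketch of that pattern, with a placeholder model name and prompt:

```python
from dataclasses import asdict
from typing import NamedTuple, Optional

from vllm import LLM, EngineArgs, SamplingParams


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: str
    stop_token_ids: Optional[list[int]] = None


def run_example() -> ModelRequestData:
    # The helper only describes how to build the engine; it does not build it.
    engine_args = EngineArgs(model="facebook/opt-125m", max_model_len=2048)
    return ModelRequestData(engine_args=engine_args, prompt="Hello, my name is")


def main(seed: Optional[int] = 0) -> None:
    req_data = run_example()
    # EngineArgs is a dataclass, so it can be converted to kwargs and
    # overridden (here with the CLI-provided seed) before creating the LLM.
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=16,
                                     stop_token_ids=req_data.stop_token_ids)
    for output in llm.generate(req_data.prompt, sampling_params):
        print(output.outputs[0].text)


if __name__ == "__main__":
    main()
```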


@@ -76,5 +76,10 @@ if __name__ == "__main__":
                                  GPUs_per_dp_rank))
         proc.start()
         procs.append(proc)
+    exit_code = 0
     for proc in procs:
         proc.join()
+        if proc.exitcode:
+            exit_code = proc.exitcode
+    exit(exit_code)
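The added lines make the data-parallel example propagate a failing worker's exit code instead of always exiting 0, so a crashed rank is visible to CI. A small self-contained sketch of the same pattern; the worker body is illustrative only:

```python
import multiprocessing
import sys


def worker(rank: int) -> None:
    # Illustrative worker; a real script would run inference here.
    if rank == 1:
        raise RuntimeError("simulated failure")


if __name__ == "__main__":
    procs = [multiprocessing.Process(target=worker, args=(i,)) for i in range(2)]
    for proc in procs:
        proc.start()

    exit_code = 0
    for proc in procs:
        proc.join()
        if proc.exitcode:
            # Remember a nonzero child exit code so the parent fails too.
            exit_code = proc.exitcode

    sys.exit(exit_code)
```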


@@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
 import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.utils import FlexibleArgumentParser
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: Sequence[PromptType]
 def run_florence2():
-    # Create a Florence-2 encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
         model="microsoft/Florence-2-large",
         tokenizer="facebook/bart-large",
         max_num_seqs=8,
@@ -39,12 +46,15 @@ def run_florence2():
             "decoder_prompt": "",
         },
     ]
-    return llm, prompts
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 def run_mllama():
-    # Create a Mllama encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
         model="meta-llama/Llama-3.2-11B-Vision-Instruct",
         max_model_len=4096,
         max_num_seqs=2,
@@ -69,12 +79,15 @@ def run_mllama():
             "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
         },
     ]
-    return llm, prompts
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 def run_whisper():
-    # Create a Whisper encoder/decoder model instance
-    llm = LLM(
+    engine_args = EngineArgs(
         model="openai/whisper-large-v3-turbo",
         max_model_len=448,
         max_num_seqs=16,
@@ -99,7 +112,11 @@ def run_whisper():
             "decoder_prompt": "<|startoftranscript|>",
         }
     ]
-    return llm, prompts
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 model_example_map = {
@@ -114,7 +131,12 @@ def main(args):
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
-    llm, prompts = model_example_map[model]()
+    req_data = model_example_map[model]()
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+    prompts = req_data.prompts
     # Create a sampling params object.
     sampling_params = SamplingParams(
@@ -153,6 +175,10 @@ if __name__ == "__main__":
         default="mllama",
         choices=model_example_map.keys(),
         help='Huggingface "model_type".')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
     args = parser.parse_args()
     main(args)


@ -8,122 +8,164 @@ on HuggingFace model repository.
""" """
import os import os
import random import random
from dataclasses import asdict
from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompts: list[str]
stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs. # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# Aria # Aria
def run_aria(questions: list[str], modality: str): def run_aria(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=4096, model=model_name,
max_num_seqs=2, max_model_len=4096,
dtype="bfloat16", max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
"<|im_end|>\n<|im_start|>assistant\n") "<|im_end|>\n<|im_start|>assistant\n")
for question in questions] for question in questions]
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# BLIP-2 # BLIP-2
def run_blip2(questions: list[str], modality: str): def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository. # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions] prompts = [f"Question: {question} Answer:" for question in questions]
llm = LLM(model="Salesforce/blip2-opt-2.7b", engine_args = EngineArgs(
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) model="Salesforce/blip2-opt-2.7b",
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Chameleon # Chameleon
def run_chameleon(questions: list[str], modality: str): def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
prompts = [f"{question}<image>" for question in questions] prompts = [f"{question}<image>" for question in questions]
llm = LLM(model="facebook/chameleon-7b", engine_args = EngineArgs(
max_model_len=4096, model="facebook/chameleon-7b",
max_num_seqs=2, max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) max_num_seqs=2,
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Deepseek-VL2 # Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str): def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=4096, model=model_name,
max_num_seqs=2, max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, max_num_seqs=2,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
prompts = [ prompts = [
f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
for question in questions for question in questions
] ]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Florence2 # Florence2
def run_florence2(question: str, modality: str): def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
llm = LLM(model="microsoft/Florence-2-large", engine_args = EngineArgs(
tokenizer="facebook/bart-large", model="microsoft/Florence-2-large",
max_num_seqs=8, tokenizer="facebook/bart-large",
trust_remote_code=True, max_num_seqs=8,
dtype="bfloat16", trust_remote_code=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompt = "<MORE_DETAILED_CAPTION>" prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
stop_token_ids = None
return llm, prompt, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Fuyu # Fuyu
def run_fuyu(questions: list[str], modality: str): def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
prompts = [f"{question}\n" for question in questions] prompts = [f"{question}\n" for question in questions]
llm = LLM(model="adept/fuyu-8b", engine_args = EngineArgs(
max_model_len=2048, model="adept/fuyu-8b",
max_num_seqs=2, max_model_len=2048,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) max_num_seqs=2,
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Gemma 3 # Gemma 3
def run_gemma3(questions: list[str], modality: str): def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "google/gemma-3-4b-it" model_name = "google/gemma-3-4b-it"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str):
prompts = [("<bos><start_of_turn>user\n" prompts = [("<bos><start_of_turn>user\n"
f"<start_of_image>{question}<end_of_turn>\n" f"<start_of_image>{question}<end_of_turn>\n"
"<start_of_turn>model\n") for question in questions] "<start_of_turn>model\n") for question in questions]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# GLM-4v # GLM-4v
def run_glm4v(questions: list[str], modality: str): def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "THUDM/glm-4v-9b" model_name = "THUDM/glm-4v-9b"
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=2048, model=model_name,
max_num_seqs=2, max_model_len=2048,
trust_remote_code=True, max_num_seqs=2,
enforce_eager=True, trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}, enforce_eager=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
prompts = [ prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str):
] ]
stop_token_ids = [151329, 151336, 151338] stop_token_ids = [151329, 151336, 151338]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# H2OVL-Mississippi # H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str): def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str):
# Stop tokens for H2OVL-Mississippi # Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m # https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# Idefics3-8B-Llama3 # Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str): def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str):
prompts = [( prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
) for question in questions] ) for question in questions]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# InternVL # InternVL
def run_internvl(questions: list[str], modality: str): def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# LLaVA-1.5 # LLaVA-1.5
def run_llava(questions: list[str], modality: str): def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
prompts = [ prompts = [
f"USER: <image>\n{question}\nASSISTANT:" for question in questions f"USER: <image>\n{question}\nASSISTANT:" for question in questions
] ]
llm = LLM(model="llava-hf/llava-1.5-7b-hf", engine_args = EngineArgs(
max_model_len=4096, model="llava-hf/llava-1.5-7b-hf",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) max_model_len=4096,
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-1.6/LLaVA-NeXT # LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str): def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions] prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", engine_args = EngineArgs(
max_model_len=8192, model="llava-hf/llava-v1.6-mistral-7b-hf",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) max_model_len=8192,
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LlaVA-NeXT-Video # LlaVA-NeXT-Video
# Currently only support for video input # Currently only support for video input
def run_llava_next_video(questions: list[str], modality: str): def run_llava_next_video(questions: list[str],
modality: str) -> ModelRequestData:
assert modality == "video" assert modality == "video"
prompts = [ prompts = [
f"USER: <video>\n{question} ASSISTANT:" for question in questions f"USER: <video>\n{question} ASSISTANT:" for question in questions
] ]
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", engine_args = EngineArgs(
max_model_len=8192, model="llava-hf/LLaVA-NeXT-Video-7B-hf",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) max_model_len=8192,
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-OneVision # LLaVA-OneVision
def run_llava_onevision(questions: list[str], modality: str): def run_llava_onevision(questions: list[str],
modality: str) -> ModelRequestData:
if modality == "video": if modality == "video":
prompts = [ prompts = [
@ -305,15 +387,20 @@ def run_llava_onevision(questions: list[str], modality: str):
<|im_start|>assistant\n" for question in questions <|im_start|>assistant\n" for question in questions
] ]
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf", engine_args = EngineArgs(
max_model_len=16384, model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) max_model_len=16384,
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
return llm, prompts, stop_token_ids )
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Mantis # Mantis
def run_mantis(questions: list[str], modality: str): def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
@ -322,14 +409,19 @@ def run_mantis(questions: list[str], modality: str):
for question in questions for question in questions
] ]
llm = LLM( engine_args = EngineArgs(
model="TIGER-Lab/Mantis-8B-siglip-llama3", model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
stop_token_ids = [128009] stop_token_ids = [128009]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
# MiniCPM-V # MiniCPM-V
@ -357,7 +449,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
# model_name = "openbmb/MiniCPM-o-2_6" # model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
@ -389,19 +481,24 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
tokenize=False, tokenize=False,
add_generation_prompt=True) for question in questions add_generation_prompt=True) for question in questions
] ]
return llm, prompts, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=stop_token_ids,
)
def run_minicpmo(questions: list[str], modality: str): def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")
def run_minicpmv(questions: list[str], modality: str): def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6") return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
# LLama 3.2 # LLama 3.2
def run_mllama(questions: list[str], modality: str): def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
@ -411,7 +508,7 @@ def run_mllama(questions: list[str], modality: str):
# You may lower either to run this example on lower-end GPUs. # You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
@ -432,17 +529,20 @@ def run_mllama(questions: list[str], modality: str):
prompts = tokenizer.apply_chat_template(messages, prompts = tokenizer.apply_chat_template(messages,
add_generation_prompt=True, add_generation_prompt=True,
tokenize=False) tokenize=False)
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Molmo # Molmo
def run_molmo(questions: list[str], modality: str): def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "allenai/Molmo-7B-D-0924" model_name = "allenai/Molmo-7B-D-0924"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
@ -453,18 +553,21 @@ def run_molmo(questions: list[str], modality: str):
f"<|im_start|>user <image>\n{question}<|im_end|> \ f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n" for question in questions <|im_start|>assistant\n" for question in questions
] ]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# NVLM-D # NVLM-D
def run_nvlm_d(questions: list[str], modality: str): def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -481,36 +584,47 @@ def run_nvlm_d(questions: list[str], modality: str):
prompts = tokenizer.apply_chat_template(messages, prompts = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma # PaliGemma
def run_paligemma(question: str, modality: str): def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
# PaliGemma has special prompt format for VQA # PaliGemma has special prompt format for VQA
prompt = ["caption en"] prompts = ["caption en" for _ in questions]
llm = LLM(model="google/paligemma-3b-mix-224", engine_args = EngineArgs(
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) model="google/paligemma-3b-mix-224",
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# PaliGemma 2 # PaliGemma 2
def run_paligemma2(question: str, modality: str): def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
# PaliGemma 2 has special prompt format for VQA # PaliGemma 2 has special prompt format for VQA
prompt = ["caption en"] prompts = ["caption en" for _ in questions]
llm = LLM(model="google/paligemma2-3b-ft-docci-448", engine_args = EngineArgs(
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) model="google/paligemma2-3b-ft-docci-448",
stop_token_ids = None disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
return llm, prompt, stop_token_ids
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Phi-3-Vision # Phi-3-Vision
def run_phi3v(questions: list[str], modality: str): def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
prompts = [ prompts = [
@ -530,7 +644,7 @@ def run_phi3v(questions: list[str], modality: str):
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM( engine_args = EngineArgs(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -539,12 +653,15 @@ def run_phi3v(questions: list[str], modality: str):
mm_processor_kwargs={"num_crops": 16}, mm_processor_kwargs={"num_crops": 16},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Phi-4-multimodal-instruct # Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str): def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
""" """
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs. show how to process image inputs.
@ -558,7 +675,7 @@ def run_phi4mm(questions: list[str], modality: str):
f"<|user|><|image_1|>{question}<|end|><|assistant|>" f"<|user|><|image_1|>{question}<|end|><|assistant|>"
for question in questions for question in questions
] ]
llm = LLM( engine_args = EngineArgs(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -567,24 +684,22 @@ def run_phi4mm(questions: list[str], modality: str):
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0, lora_extra_vocab_size=0,
) )
lora_request = LoRARequest("vision", 1, vision_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
stop_token_ids = None return ModelRequestData(
return llm, prompts, stop_token_ids engine_args=engine_args,
prompts=prompts,
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)
# Pixtral HF-format # Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str): def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# NOTE: Need L40 (or equivalent) to avoid OOM # NOTE: Need L40 (or equivalent) to avoid OOM
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -592,15 +707,18 @@ def run_pixtral_hf(questions: list[str], modality: str):
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Qwen # Qwen
def run_qwen_vl(questions: list[str], modality: str): def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
llm = LLM( engine_args = EngineArgs(
model="Qwen/Qwen-VL", model="Qwen/Qwen-VL",
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
@ -610,16 +728,19 @@ def run_qwen_vl(questions: list[str], modality: str):
) )
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions] prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Qwen2-VL # Qwen2-VL
def run_qwen2_vl(questions: list[str], modality: str): def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
@ -642,16 +763,19 @@ def run_qwen2_vl(questions: list[str], modality: str):
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions "<|im_start|>assistant\n") for question in questions
] ]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# Qwen2.5-VL # Qwen2.5-VL
def run_qwen2_5_vl(questions: list[str], modality: str): def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=5, max_num_seqs=5,
@ -674,8 +798,11 @@ def run_qwen2_5_vl(questions: list[str], modality: str):
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") for question in questions "<|im_start|>assistant\n") for question in questions
] ]
stop_token_ids = None
return llm, prompts, stop_token_ids return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
model_example_map = { model_example_map = {
@ -789,18 +916,28 @@ def main(args):
data = mm_input["data"] data = mm_input["data"]
questions = mm_input["questions"] questions = mm_input["questions"]
llm, prompts, stop_token_ids = model_example_map[model](questions, req_data = model_example_map[model](questions, modality)
modality)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = prompts if args.use_different_prompt_per_request else [ prompts = req_data.prompts if args.use_different_prompt_per_request else [
prompts[0] req_data.prompts[0]
] ]
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference. # even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, sampling_params = SamplingParams(temperature=0.2,
max_tokens=64, max_tokens=64,
stop_token_ids=stop_token_ids) stop_token_ids=req_data.stop_token_ids)
assert args.num_prompts > 0 assert args.num_prompts > 0
if args.num_prompts == 1: if args.num_prompts == 1:
@ -865,6 +1002,10 @@ if __name__ == "__main__":
type=int, type=int,
default=16, default=16,
help='Number of frames to extract from the video.') help='Number of frames to extract from the video.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument( parser.add_argument(
'--image-repeat-prob', '--image-repeat-prob',


@@ -7,11 +7,12 @@ For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
 from argparse import Namespace
+from dataclasses import asdict
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
 from PIL.Image import Image
-from vllm import LLM
+from vllm import LLM, EngineArgs
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
@@ -37,12 +38,12 @@ Query = Union[TextQuery, ImageQuery, TextImageQuery]
 class ModelRequestData(NamedTuple):
-    llm: LLM
+    engine_args: EngineArgs
     prompt: str
     image: Optional[Image]
-def run_e5_v(query: Query):
+def run_e5_v(query: Query) -> ModelRequestData:
     llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
     if query["modality"] == "text":
@@ -58,20 +59,20 @@ def run_e5_v(query: Query):
         modality = query['modality']
         raise ValueError(f"Unsupported query modality: '{modality}'")
-    llm = LLM(
+    engine_args = EngineArgs(
         model="royokong/e5-v",
         task="embed",
         max_model_len=4096,
     )
     return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
         prompt=prompt,
         image=image,
     )
-def run_vlm2vec(query: Query):
+def run_vlm2vec(query: Query) -> ModelRequestData:
     if query["modality"] == "text":
         text = query["text"]
         prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
@@ -87,7 +88,7 @@ def run_vlm2vec(query: Query):
         modality = query['modality']
         raise ValueError(f"Unsupported query modality: '{modality}'")
-    llm = LLM(
+    engine_args = EngineArgs(
         model="TIGER-Lab/VLM2Vec-Full",
         task="embed",
         trust_remote_code=True,
@@ -95,7 +96,7 @@ def run_vlm2vec(query: Query):
     )
     return ModelRequestData(
-        llm=llm,
+        engine_args=engine_args,
         prompt=prompt,
         image=image,
     )
@@ -126,15 +127,18 @@ def get_query(modality: QueryModality):
     raise ValueError(msg)
-def run_encode(model: str, modality: QueryModality):
+def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
     query = get_query(modality)
     req_data = model_example_map[model](query)
+    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    llm = LLM(**engine_args)
     mm_data = {}
     if req_data.image is not None:
         mm_data["image"] = req_data.image
-    outputs = req_data.llm.embed({
+    outputs = llm.embed({
         "prompt": req_data.prompt,
         "multi_modal_data": mm_data,
     })
@@ -144,7 +148,7 @@ def run_encode(model: str, modality: QueryModality):
 def main(args: Namespace):
-    run_encode(args.model_name, args.modality)
+    run_encode(args.model_name, args.modality, args.seed)
 model_example_map = {
@@ -167,5 +171,10 @@ if __name__ == "__main__":
         default="image",
         choices=get_args(QueryModality),
         help='Modality of the input.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
     args = parser.parse_args()
main(args) main(args)
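The change above defers engine construction: each run_* helper now returns EngineArgs, and run_encode merges the CLI seed in via asdict just before building the LLM. A minimal sketch of that pattern outside the script, with an illustrative seed value of 0 standing in for --seed:

from dataclasses import asdict

from vllm import LLM, EngineArgs

# Engine description only; nothing is loaded yet.
engine_args = EngineArgs(model="royokong/e5-v", task="embed", max_model_len=4096)

# Merge the runtime seed override and build the engine from the combined kwargs.
llm = LLM(**(asdict(engine_args) | {"seed": 0}))

# Text-only embedding request, mirroring the script's text modality.
outputs = llm.embed({"prompt": "A cat standing in the snow."})
print(len(outputs[0].outputs.embedding))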

View File

@ -6,13 +6,14 @@ using the chat template defined by the model.
""" """
import os import os
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple, Optional from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer from transformers import AutoProcessor, AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
@ -25,11 +26,12 @@ IMAGE_URLS = [
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
llm: LLM engine_args: EngineArgs
prompt: str prompt: str
stop_token_ids: Optional[list[int]]
image_data: list[Image] image_data: list[Image]
chat_template: Optional[str] stop_token_ids: Optional[list[int]] = None
chat_template: Optional[str] = None
lora_requests: Optional[list[LoRARequest]] = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@ -37,53 +39,55 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
def load_aria(question, image_urls: list[str]) -> ModelRequestData: def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "rhymes-ai/Aria" model_name = "rhymes-ai/Aria"
llm = LLM(model=model_name, engine_args = EngineArgs(
tokenizer_mode="slow", model=model_name,
trust_remote_code=True, tokenizer_mode="slow",
dtype="bfloat16", trust_remote_code=True,
limit_mm_per_prompt={"image": len(image_urls)}) dtype="bfloat16",
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls) placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_deepseek_vl2(question: str, image_urls: list[str]): def load_deepseek_vl2(question: str,
image_urls: list[str]) -> ModelRequestData:
model_name = "deepseek-ai/deepseek-vl2-tiny" model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name, engine_args = EngineArgs(
max_model_len=4096, model=model_name,
max_num_seqs=2, max_model_len=4096,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)}) hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholder = "".join(f"image_{i}:<image>\n" placeholder = "".join(f"image_{i}:<image>\n"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:" prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_gemma3(question, image_urls: list[str]) -> ModelRequestData: def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "google/gemma-3-4b-it" model_name = "google/gemma-3-4b-it"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -112,18 +116,16 @@ def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
add_generation_prompt=True) add_generation_prompt=True)
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m" model_name = "h2oai/h2ovl-mississippi-800m"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -146,19 +148,18 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_idefics3(question, image_urls: list[str]) -> ModelRequestData: def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3" model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=16, max_num_seqs=16,
@ -177,18 +178,16 @@ def load_idefics3(question, image_urls: list[str]) -> ModelRequestData:
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501 prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -214,19 +213,18 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_mllama(question, image_urls: list[str]) -> ModelRequestData: def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=16, max_num_seqs=16,
@ -236,19 +234,17 @@ def load_mllama(question, image_urls: list[str]) -> ModelRequestData:
placeholders = "<|image|>" * len(image_urls) placeholders = "<|image|>" * len(image_urls)
prompt = f"{placeholders}<|begin_of_text|>{question}" prompt = f"{placeholders}<|begin_of_text|>{question}"
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
def load_nvlm_d(question: str, image_urls: list[str]): def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "nvidia/NVLM-D-72B" model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
@ -266,14 +262,11 @@ def load_nvlm_d(question: str, image_urls: list[str]):
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -281,7 +274,7 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b" model_name = "mistral-community/pixtral-12b"
# Adjust this as necessary to fit in GPU # Adjust this as necessary to fit in GPU
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
@ -291,14 +284,11 @@ def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "[IMG]" * len(image_urls) placeholders = "[IMG]" * len(image_urls)
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]" prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -315,7 +305,7 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
# #
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194 # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM( engine_args = EngineArgs(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
@ -326,14 +316,11 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders = "\n".join(f"<|image_{i}|>" placeholders = "\n".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
@ -347,7 +334,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
# Since the vision-lora and speech-lora co-exist with the base model, # Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights. # we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora") vision_lora_path = os.path.join(model_path, "vision-lora")
llm = LLM( engine_args = EngineArgs(
model=model_path, model=model_path,
trust_remote_code=True, trust_remote_code=True,
max_model_len=10000, max_model_len=10000,
@ -357,30 +344,23 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
max_lora_rank=320, max_lora_rank=320,
lora_extra_vocab_size=0, lora_extra_vocab_size=0,
) )
lora_request = LoRARequest("vision", 1, vision_lora_path)
# To maintain code compatibility in this script, we add LoRA here.
llm.llm_engine.add_lora(lora_request=lora_request)
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
placeholders = "".join(f"<|image_{i}|>" placeholders = "".join(f"<|image_{i}|>"
for i, _ in enumerate(image_urls, start=1)) for i, _ in enumerate(image_urls, start=1))
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>" prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
stop_token_ids = None
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None, lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
) )
def load_qwen_vl_chat(question: str, def load_qwen_vl_chat(question: str,
image_urls: list[str]) -> ModelRequestData: image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat" model_name = "Qwen/Qwen-VL-Chat"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
@ -411,7 +391,7 @@ def load_qwen_vl_chat(question: str,
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
@ -419,7 +399,7 @@ def load_qwen_vl_chat(question: str,
) )
def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
@ -431,7 +411,7 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40 # Tested on L40
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
@ -460,23 +440,19 @@ def load_qwen2_vl(question, image_urls: list[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
image_data, _ = process_vision_info(messages) image_data, _ = process_vision_info(messages)
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
try: try:
from qwen_vl_utils import process_vision_info from qwen_vl_utils import process_vision_info
except ModuleNotFoundError: except ModuleNotFoundError:
@ -487,7 +463,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct" model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
llm = LLM( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5, max_num_seqs=5,
@ -516,8 +492,6 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
if process_vision_info is None: if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls] image_data = [fetch_image(url) for url in image_urls]
else: else:
@ -525,11 +499,9 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
return_video_kwargs=False) return_video_kwargs=False)
return ModelRequestData( return ModelRequestData(
llm=llm, engine_args=engine_args,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data, image_data=image_data,
chat_template=None,
) )
@ -551,14 +523,25 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: list[str]): def run_generate(model, question: str, image_urls: list[str],
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.generate( outputs = llm.generate(
{ {
"prompt": req_data.prompt, "prompt": req_data.prompt,
"multi_modal_data": { "multi_modal_data": {
@ -572,13 +555,24 @@ def run_generate(model, question: str, image_urls: list[str]):
print(generated_text) print(generated_text)
def run_chat(model: str, question: str, image_urls: list[str]): def run_chat(model: str, question: str, image_urls: list[str],
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=128,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = req_data.llm.chat( outputs = llm.chat(
[{ [{
"role": "role":
"user", "user",
@ -607,11 +601,12 @@ def run_chat(model: str, question: str, image_urls: list[str]):
def main(args: Namespace): def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed
if method == "generate": if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS) run_generate(model, QUESTION, IMAGE_URLS, seed)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS) run_chat(model, QUESTION, IMAGE_URLS, seed)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")
@ -632,6 +627,10 @@ if __name__ == "__main__":
default="generate", default="generate",
choices=["generate", "chat"], choices=["generate", "chat"],
help="The method to run in `vllm.LLM`.") help="The method to run in `vllm.LLM`.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
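With lora_requests now carried in ModelRequestData, the script registers adapters on the engine right after construction. A small sketch of both ways to attach a LoRA adapter; the base model and adapter path below are placeholders, not taken from the script:

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholder base model and adapter path.
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enable_lora=True)
lora_request = LoRARequest("my_adapter", 1, "/path/to/lora-adapter")

# Option 1: register the adapter on the engine up front, as run_generate/run_chat do.
llm.llm_engine.add_lora(lora_request=lora_request)

# Option 2: attach it per request instead.
outputs = llm.generate("Hello, my name is",
                       SamplingParams(max_tokens=32),
                       lora_request=lora_request)
print(outputs[0].outputs[0].text)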

View File

@ -28,7 +28,7 @@ pyzmq
msgspec msgspec
gguf == 0.10.0 gguf == 0.10.0
importlib_metadata importlib_metadata
mistral_common[opencv] >= 1.5.0 mistral_common[opencv] >= 1.5.4
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12

View File

@ -15,7 +15,7 @@ pydantic >= 2.8
torch torch
py-cpuinfo py-cpuinfo
transformers transformers
mistral_common >= 1.5.0 mistral_common >= 1.5.4
aiohttp aiohttp
starlette starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

View File

@ -8,6 +8,7 @@ pytest-shard
# testing utils # testing utils
awscli awscli
backoff # required for phi4mm test
decord # required for video tests decord # required for video tests
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
@ -26,7 +27,7 @@ torchaudio==2.6.0
torchvision==0.21.0 torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.0 # required for pixtral test mistral_common[opencv] >= 1.5.4 # required for pixtral test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2 transformers==4.48.2
@ -39,4 +40,4 @@ tritonclient==2.51.0
numpy < 2.0.0 numpy < 2.0.0
runai-model-streamer==0.11.0 runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0 runai-model-streamer-s3==0.11.0

View File

@ -33,6 +33,8 @@ audioread==3.0.1
# via librosa # via librosa
awscli==1.35.23 awscli==1.35.23
# via -r requirements/test.in # via -r requirements/test.in
backoff==2.2.1
# via -r requirements/test.in
bitsandbytes==0.45.3 bitsandbytes==0.45.3
# via -r requirements/test.in # via -r requirements/test.in
black==24.10.0 black==24.10.0

View File

@ -17,9 +17,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

View File

@ -294,26 +294,28 @@ class repackage_wheel(build_ext):
]).decode("utf-8") ]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"] upstream_main_commit = json.loads(resp_json)["sha"]
# Check if the local main branch is up-to-date. This is to ensure # Check if the upstream_main_commit exists in the local repo
# the base commit we found is the most recent commit on the main try:
# branch. subprocess.check_output(
local_main_commit = subprocess.check_output( ["git", "cat-file", "-e", f"{upstream_main_commit}"])
["git", "rev-parse", "main"]).decode("utf-8").strip() except subprocess.CalledProcessError:
if local_main_commit != upstream_main_commit: # If not present, fetch it from the remote repository.
raise ValueError( # Note that this does not update any local branches,
f"Local main branch ({local_main_commit}) is not " # but ensures that this commit ref and its history are
"up-to-date with upstream main branch " # available in our local repo.
f"({upstream_main_commit}). Please pull the latest " subprocess.check_call([
"changes from upstream main branch first.") "git", "fetch", "https://github.com/vllm-project/vllm",
"main"
])
# Then get the commit hash of the current branch that is the same as # Then get the commit hash of the current branch that is the same as
# the upstream main commit. # the upstream main commit.
current_branch = subprocess.check_output( current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip() ["git", "branch", "--show-current"]).decode("utf-8").strip()
base_commit = subprocess.check_output( base_commit = subprocess.check_output([
["git", "merge-base", "main", "git", "merge-base", f"{upstream_main_commit}", current_branch
current_branch]).decode("utf-8").strip() ]).decode("utf-8").strip()
return base_commit return base_commit
except ValueError as err: except ValueError as err:
raise ValueError(err) from None raise ValueError(err) from None
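The wheel-repackaging change no longer insists that the local main branch be up to date; it only needs the upstream commit object to be reachable locally. A condensed sketch of the new resolution logic, assuming upstream_main_commit was already obtained from the GitHub API:

import subprocess


def resolve_base_commit(upstream_main_commit: str) -> str:
    # Ensure the upstream commit object exists in the local repository.
    try:
        subprocess.check_output(
            ["git", "cat-file", "-e", upstream_main_commit])
    except subprocess.CalledProcessError:
        # Fetch it from upstream without updating any local branches.
        subprocess.check_call(
            ["git", "fetch", "https://github.com/vllm-project/vllm", "main"])

    current_branch = subprocess.check_output(
        ["git", "branch", "--show-current"]).decode("utf-8").strip()

    # The wheel to reuse corresponds to the last commit shared with upstream.
    return subprocess.check_output(
        ["git", "merge-base", upstream_main_commit,
         current_branch]).decode("utf-8").strip()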

View File

@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enforce_eager", [False])
def test_models( def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
model: str, model: str,
backend: str, backend: str,
@ -63,31 +64,33 @@ def test_models(
pytest.skip( pytest.skip(
f"{backend} does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2 # 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096 # gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window # we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join( prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:" str(i) for i in range(1024)) + " are:"
example_prompts = [prompt] example_prompts = [prompt]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@ -104,6 +107,7 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
]) ])
def test_models_distributed( def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
@ -116,34 +120,41 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE: if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}") pytest.skip(f"Skip test for {test_suite}")
if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa with monkeypatch.context() as monkeypatch_context:
# test Ray Compiled Graph if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" # test Ray Compiled Graph
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if attention_backend: if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend monkeypatch_context.setenv(
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
dtype = "half" dtype = "half"
max_tokens = 5 max_tokens = 5
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method
with vllm_runner(model, # (the default method).
dtype=dtype, with vllm_runner(
tensor_parallel_size=2, model,
distributed_executor_backend=distributed_executor_backend dtype=dtype,
) as vllm_model: tensor_parallel_size=2,
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
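Most test files in this commit switch from assigning to os.environ directly to pytest.MonkeyPatch.context(), so overrides are scoped to the block and rolled back even when a test fails. A minimal illustration of the pattern (the variable name is taken from the tests above):

import os

import pytest


def test_env_override(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        # The override is visible only inside this block.
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
        assert os.environ["VLLM_ATTENTION_BACKEND"] == "FLASH_ATTN"
    # On exit the variable is restored to whatever value (or absence)
    # it had before, even if an assertion inside the block failed.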

View File

@ -7,16 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
@ -24,12 +30,14 @@ MODELS = [
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file. all tests in the file.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models( def test_models(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
@ -52,37 +60,39 @@ def test_models(
enforce_eager: bool, enforce_eager: bool,
tensor_parallel_size: int, tensor_parallel_size: int,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Checks exact match decode between huggingface model and vllm runner with Checks exact match decode between huggingface model and vllm runner with
chunked prefill. chunked prefill.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
max_num_seqs = chunked_prefill_token_size max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@ -90,57 +100,61 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
distributed_executor_backend: str, distributed_executor_backend: str,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
if (model == "meta-llama/Llama-3.2-1B-Instruct" dtype = "half"
and distributed_executor_backend == "ray"): max_tokens = 5
# test Ray Compiled Graph chunked_prefill_token_size = 16
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
dtype = "half" # Add a chunked prefill config.
max_tokens = 5 max_num_seqs = min(chunked_prefill_token_size, 256)
chunked_prefill_token_size = 16 assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
# Add a chunked prefill config. # NOTE: take care of the order. run vLLM first, and then run HF.
max_num_seqs = min(chunked_prefill_token_size, 256) # vLLM needs a fresh new process without cuda initialization.
assert chunked_prefill_token_size != -1 # if we run HF first, the cuda initialization will be done and it
enable_chunked_prefill = True # will hurt multiprocessing backend with
max_num_batched_tokens = chunked_prefill_token_size # fork method (the default method).
# NOTE: take care of the order. run vLLM first, and then run HF. with vllm_runner(
# vLLM needs a fresh new process without cuda initialization. model,
# if we run HF first, the cuda initialization will be done and it dtype=dtype,
# will hurt multiprocessing backend with fork method (the default method). tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
with vllm_runner( with hf_runner(model, dtype=dtype) as hf_model:
model, hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model: check_outputs_equal(
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
check_outputs_equal( name_0="hf",
outputs_0_lst=hf_outputs, name_1="vllm",
outputs_1_lst=vllm_outputs, )
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -158,7 +172,7 @@ def test_models_distributed(
# the async postprocessor # the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True]) @pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache( def test_models_with_fp8_kv_cache(
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
kv_cache_dtype: str, kv_cache_dtype: str,
model: str, model: str,
@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching( def test_with_prefix_caching(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,
@ -254,8 +268,10 @@ def test_with_prefix_caching(
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy([prompt], outputs[enable] += vllm_model.generate_greedy(
max_tokens) [prompt],
max_tokens,
)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs[False], outputs_0_lst=outputs[False],
@ -274,8 +290,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu( def test_models_cpu(
hf_runner, hf_runner: HfRunner,
vllm_runner, vllm_runner: VllmRunner,
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
@ -283,7 +299,7 @@ def test_models_cpu(
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
enforce_eager: bool, enforce_eager: bool,
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
test_models( test_models(
hf_runner, hf_runner,
@ -307,7 +323,7 @@ def test_models_cpu(
@pytest.mark.cpu_model @pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu( def test_with_prefix_caching_cpu(
vllm_runner, vllm_runner: VllmRunner,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
chunk_size: int, chunk_size: int,

View File

@ -7,10 +7,10 @@ from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from ..utils import fork_new_process_for_each_test from ..utils import create_new_process_for_each_test
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_python_error(): def test_python_error():
""" """
Test if Python error occurs when there's low-level Test if Python error occurs when there's low-level
@ -36,7 +36,7 @@ def test_python_error():
allocator.wake_up() allocator.wake_up()
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_basic_cumem(): def test_basic_cumem():
# some tensors from default memory pool # some tensors from default memory pool
shape = (1024, 1024) shape = (1024, 1024)
@ -69,7 +69,7 @@ def test_basic_cumem():
assert torch.allclose(output, torch.ones_like(output) * 3) assert torch.allclose(output, torch.ones_like(output) * 3)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_cumem_with_cudagraph(): def test_cumem_with_cudagraph():
allocator = CuMemAllocator.get_instance() allocator = CuMemAllocator.get_instance()
with allocator.use_memory_pool(): with allocator.use_memory_pool():
@ -114,7 +114,7 @@ def test_cumem_with_cudagraph():
assert torch.allclose(y, x + 1) assert torch.allclose(y, x + 1)
@fork_new_process_for_each_test @create_new_process_for_each_test()
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model, use_v1", "model, use_v1",
[ [
@ -123,40 +123,38 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), ("facebook/opt-125m", False),
]) ])
def test_end_to_end(model: str, use_v1: bool): def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
import os with monkeypatch.context() as m:
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
free, total = torch.cuda.mem_get_info() free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case another process is running used_bytes_baseline = total - free # in case another process is running
llm = LLM(model, enable_sleep_mode=True) llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?" prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10) sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params) output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only # which is difficult to measure in the test. therefore, we only
# test sleep level 1 here. # test sleep level 1 here.
llm.sleep(level=1) llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool, # now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights) # and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be releasesd from PyTorch due to a known bug, # is captured but cannot be releasesd from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected. # therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1. # in V1.
if use_v1: if use_v1:
assert used_bytes < 7 * GiB_bytes assert used_bytes < 7 * GiB_bytes
else: else:
assert used_bytes < 2 * GiB_bytes assert used_bytes < 2 * GiB_bytes
llm.wake_up() llm.wake_up()
output2 = llm.generate(prompt, sampling_params) output2 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import dataclasses import dataclasses
from typing import Optional
import pytest import pytest
@ -22,75 +22,76 @@ class TestSetting:
fullgraph: bool fullgraph: bool
# representative settings for testing
test_settings = [
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
]
# we cannot afford testing the full Cartesian product # we cannot afford testing the full Cartesian product
# of all models and all levels # of all models and all levels
@pytest.mark.parametrize("test_setting", test_settings) @pytest.mark.parametrize(
def test_compile_correctness(test_setting: TestSetting): "test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model_args=[],
pp_size=1,
tp_size=2,
attn_backend="FLASH_ATTN",
method="generate",
fullgraph=True,
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suites, with different GPUs. # this test is run under multiple suites, with different GPUs.
# make sure we only run the test with correct CUDA devices. # make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests. # don't use "<", as it will duplicate the tests.
@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size: if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.") pytest.skip("Not correct CUDA devices for the test.")
import os
os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
["-tp", str(tp_size)]
all_args: list[list[str]] = [] with monkeypatch.context() as m:
all_envs: list[Optional[dict[str, str]]] = [] m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
"--enforce-eager", *model_args, "-pp",
str(pp_size), "-tp",
str(tp_size)
]
for level in [ all_args: list[list[str]] = []
CompilationLevel.NO_COMPILATION, all_envs: list[dict[str, str] | None] = []
CompilationLevel.PIECEWISE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
# inductor will change the output, so we only compare if the output for level in [
# is close, not exactly the same. CompilationLevel.NO_COMPILATION,
compare_all_settings( CompilationLevel.PIECEWISE,
model, ]:
all_args, all_args.append(final_args + [f"-O{level}"])
all_envs, all_envs.append({})
method=method if method != "generate" else "generate_close")
all_envs.clear()
all_args.clear()
for level in [ # inductor will change the output, so we only compare if the output
CompilationLevel.NO_COMPILATION, # is close, not exactly the same.
CompilationLevel.DYNAMO_AS_IS, compare_all_settings(
CompilationLevel.DYNAMO_ONCE, model,
]: all_args,
all_args.append(final_args + [f"-O{level}"]) all_envs,
all_envs.append({}) method=method if method != "generate" else "generate_close")
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: all_envs.clear()
# "DYNAMO_ONCE" will always use fullgraph all_args.clear()
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method) for level in [
CompilationLevel.NO_COMPILATION,
CompilationLevel.DYNAMO_AS_IS,
CompilationLevel.DYNAMO_ONCE,
]:
all_args.append(final_args + [f"-O{level}"])
all_envs.append({})
if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
# "DYNAMO_ONCE" will always use fullgraph
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method)
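Throughout this change, direct os.environ mutation in tests is replaced with pytest's MonkeyPatch, so environment overrides are scoped to the block that sets them and are undone automatically. A minimal standalone sketch of that pattern (the variable name here is only illustrative, not one used by vLLM):

import os

import pytest


def test_env_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Inside the context, the override is visible ...
    with monkeypatch.context() as m:
        m.setenv("EXAMPLE_BACKEND", "FLASH_ATTN")  # illustrative name only
        assert os.environ["EXAMPLE_BACKEND"] == "FLASH_ATTN"
    # ... and it is rolled back as soon as the context exits.
    assert "EXAMPLE_BACKEND" not in os.environ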

View File

@ -1,22 +1,115 @@
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import Any

import pytest
import torch

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel
from vllm.platforms import current_platform

from ..utils import create_new_process_for_each_test
@pytest.fixture(params=None, name="model_info")
def models_list_fixture(request):
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
return TEST_MODELS
@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"optimization_level", "optimization_level",
[CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
@fork_new_process_for_each_test )
def test_full_graph(model_info, optimization_level): @pytest.mark.parametrize("model_info", "", indirect=True)
model = model_info[0] @create_new_process_for_each_test()
model_kwargs = model_info[1] def test_full_graph(
check_full_graph_support(model, monkeypatch: pytest.MonkeyPatch,
model_kwargs, model_info: tuple[str, dict[str, Any]],
optimization_level, optimization_level: int,
tp_size=1) ):
model, model_kwargs = model_info
with monkeypatch.context() as m:
# make sure these models can be captured in full graph mode
m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(
model=model,
enforce_eager=True,
tensor_parallel_size=1,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs,
)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -1,93 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
TEST_MODELS = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
"quantization": "compressed-tensors"
}),
("meta-llama/Llama-3.2-1B-Instruct", {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
"quantization": "marlin"
}))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
"quantization": "AWQ"
}))
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tp_size,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@ -681,6 +681,17 @@ def hf_runner():
class VllmRunner:
    """
    The default values of some arguments have been modified from
    :class:`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
    """

    def __init__(
        self,
@ -688,6 +699,8 @@ class VllmRunner:
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len: int = 1024,
@ -695,7 +708,7 @@ class VllmRunner:
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
@ -705,8 +718,9 @@ class VllmRunner:
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
@ -1096,4 +1110,4 @@ def pytest_collection_modifyitems(config, items):
    skip_optional = pytest.mark.skip(reason="need --optional option to run")
    for item in items:
        if "optional" in item.keywords:
            item.add_marker(skip_optional)
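With these defaults, a test that builds the runner through the existing vllm_runner fixture no longer needs to pass seed or trust_remote_code explicitly. A rough usage sketch under that assumption (the fixture name and generate_greedy helper are taken from the existing test suite; the model, prompt, and token count are examples only):

def test_opt_greedy_is_reproducible(vllm_runner):
    # seed=0 and trust_remote_code=True now come from VllmRunner's defaults.
    with vllm_runner("facebook/opt-125m", max_model_len=256) as vllm_model:
        first = vllm_model.generate_greedy(["Hello, my name is"], 16)
        second = vllm_model.generate_greedy(["Hello, my name is"], 16)
    assert first == second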

View File

@ -3,7 +3,10 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""
from __future__ import annotations

from typing import Any, Callable

import pytest
import ray
@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_tensor_dict_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
def send_recv_test_worker(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    pp_size: int,
    rank: int,
    distributed_init_port: str,
):
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(tp_size, pp_size, rank,
                                      distributed_init_port)
@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
    all_reduce_test_worker, all_gather_test_worker,
    broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
    test_target: Callable[..., Any],
):
    multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
def test_multi_process_pipeline_parallel(
    monkeypatch: pytest.MonkeyPatch,
    pp_size: int,
    test_target: Callable[..., Any],
):
    multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
    broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel_pipeline_parallel(
    tp_size: int,
    pp_size: int,
    test_target: Callable[..., Any],
    monkeypatch: pytest.MonkeyPatch,
):
    multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
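The ray workers above are not pytest tests themselves, so the monkeypatch object has to be handed to them explicitly. When no fixture is available at all, pytest.MonkeyPatch can also be used directly; a small sketch of that fallback (purely illustrative, not code from this diff):

import pytest


def scrub_cuda_visible_devices():
    # MonkeyPatch.context() also works outside of test functions, which is
    # handy for helper processes that only need a temporary env change.
    with pytest.MonkeyPatch.context() as mp:
        mp.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        # ... run the GPU-visibility-sensitive setup here ...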

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import random

import pytest
@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
                                          distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tensor_model_parallel_group().device_group

        # A small all_reduce for warmup.
        # this is needed because device communicators might be created lazily
        # (e.g. NCCL). This will ensure that the communicator is initialized
        # before any communication happens, so that this group can be used for
        # graph capture immediately.
        data = torch.zeros(1)
        data = data.to(device=device)
        torch.distributed.all_reduce(data, group=group)
        torch.cuda.synchronize()
        del data

        # we use the first group to communicate once
        # and the second group to communicate twice
        # and so on
        # this is used to demonstrate that each group can
        # communicate independently
        num_communication = rank // tp_size + 1

        for sz in test_sizes:
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # use integers so result matches NCCL exactly
                    inp1 = torch.randint(1,
                                         16, (sz, ),
                                         dtype=dtype,
                                         device=torch.cuda.current_device())
                    inp2 = torch.randint(1,
                                         16, (sz, ),
                                         dtype=dtype,
                                         device=torch.cuda.current_device())
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph,
                                          stream=graph_capture_context.stream):
                        for i in range(num_communication):
                            out1 = tensor_model_parallel_all_reduce(inp1)
                            # the input buffer is immediately modified to test
                            # synchronization
                            dist.all_reduce(inp1, group=group)
                            out2 = tensor_model_parallel_all_reduce(inp2)
                            dist.all_reduce(inp2, group=group)
                graph.replay()
                torch.testing.assert_close(out1, inp1)
                torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
                                          distributed_init_port)

        # we use the first group to communicate once
        # and the second group to communicate twice
        # and so on
        # this is used to demonstrate that each group can
        # communicate independently
        num_communication = rank // tp_size + 1
        sz = 1024
        fa = get_tp_group().ca_comm
        inp = torch.ones(sz, dtype=torch.float32, device=device)
        out = inp
        for _ in range(num_communication):
            out = fa.all_reduce(out, registered=False)
        torch.testing.assert_close(out, inp * (tp_size**num_communication))

        inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
        out = inp
        for _ in range(num_communication):
            out = fa.all_reduce(out, registered=False)
        torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): def test_custom_allreduce(
monkeypatch: pytest.MonkeyPatch,
tp_size,
pipeline_parallel_size,
test_target,
):
world_size = tp_size * pipeline_parallel_size world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count(): if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.") pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(tp_size, pipeline_parallel_size, test_target) multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
test_target)

View File

@ -8,7 +8,7 @@ import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger

from ..utils import compare_two_settings, create_new_process_for_each_test

logger = init_logger("test_expert_parallel")
@ -209,7 +209,7 @@ def _compare_tp(
        for params in settings.iter_params(model_name)
    ],
)
@create_new_process_for_each_test()
def test_ep(
    model_name: str,
    parallel_setup: ParallelSetup,

View File

@ -17,7 +17,7 @@ from vllm.config import TaskOption
from vllm.logger import init_logger

from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test

logger = init_logger("test_pipeline_parallel")
@ -350,6 +350,10 @@ def _compare_tp(
    else:
        pp_env = None

    tp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }

    pp_args = [
        *common_args,
        "--pipeline-parallel-size",
@ -374,14 +378,20 @@ def _compare_tp(
    ]

    try:
        compare_two_settings(model_id,
                             pp_args,
                             tp_args,
                             pp_env,
                             tp_env,
                             method=method)
    except Exception:
        testing_ray_compiled_graph = pp_env is not None
        if testing_ray_compiled_graph and vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
@pytest.mark.parametrize(
@ -392,7 +402,7 @@ def _compare_tp(
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_language_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
@ -421,7 +431,7 @@ def test_tp_language_generation(
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_language_embedding(
    model_id: str,
    parallel_setup: ParallelSetup,
@ -450,7 +460,7 @@ def test_tp_language_embedding(
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_tp_multimodal_generation(
    model_id: str,
    parallel_setup: ParallelSetup,

View File

@ -7,33 +7,35 @@ import pytest
from vllm.distributed.utils import get_pp_indices


def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:

        def _verify(partition_str, num_layers, pp_size, goldens):
            bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
            m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
            for pp_rank, golden in enumerate(goldens):
                assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
            if bak is not None:
                m.setenv("VLLM_PP_LAYER_PARTITION", bak)

        # Even partition
        _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
        # Balanced partition
        _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
        # Put remainder somewhere
        _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
        # Invalid partition strings
        with pytest.raises(ValueError):
            _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
        with pytest.raises(ValueError):
            _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
        # Wrong number of partitions
        with pytest.raises(ValueError):
            _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
        # Wrong number of layers
        with pytest.raises(ValueError):
            _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@pytest.mark.parametrize(
@ -55,6 +57,10 @@ def test_custom_layer_partition():
    (5, 3, 1, (2, 4)),
    (5, 3, 2, (4, 5)),
])
def test_uneven_auto_partition(
    num_hidden_layers: int,
    pp_size: int,
    pp_rank: int,
    indices: tuple[int, int],
):
    assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
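For a custom partition such as "4,6,6,4", the golden (start, end) pairs in the test above are just running sums of the per-stage layer counts. A tiny re-derivation of those values, written only to illustrate the arithmetic (this is not the actual get_pp_indices implementation):

def partition_indices(partition_str: str, pp_rank: int) -> tuple[int, int]:
    # "4,6,6,4" -> [4, 6, 6, 4]; stage k owns layers [sum(:k), sum(:k+1)).
    counts = [int(x) for x in partition_str.split(",")]
    start = sum(counts[:pp_rank])
    return start, start + counts[pp_rank]


assert [partition_indices("4,6,6,4", r) for r in range(4)] == \
    [(0, 4), (4, 10), (10, 16), (16, 20)]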

View File

@ -1,10 +1,14 @@
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from ..utils import compare_two_settings, create_new_process_for_each_test

if TYPE_CHECKING:
    from typing_extensions import LiteralString


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
@ -14,19 +18,25 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
    "FLASH_ATTN",
    "FLASHINFER",
])
@create_new_process_for_each_test()
def test_pp_cudagraph(
    monkeypatch: pytest.MonkeyPatch,
    PP_SIZE: int,
    MODEL_NAME: str,
    ATTN_BACKEND: LiteralString,
):
    with monkeypatch.context() as m:
        cudagraph_args = [
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "float16",
            "--pipeline-parallel-size",
            str(PP_SIZE),
            "--distributed-executor-backend",
            "mp",
        ]
        m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)

        eager_args = cudagraph_args + ["--enforce-eager"]
        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)

View File

@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
        run_test(more_args)


def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:

View File

@ -4,12 +4,12 @@ import pytest
from vllm import LLM

from ...utils import create_new_process_for_each_test


@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@create_new_process_for_each_test()
def test_collective_rpc(tp_size, backend):
    if tp_size == 1 and backend == "ray":
        pytest.skip("Skip duplicate test case")

View File

@ -53,32 +53,37 @@ def cache_models():
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
    with monkeypatch.context() as m:
        try:
            m.setenv("HF_HUB_OFFLINE", "1")
            m.setenv("VLLM_NO_USAGE_STATS", "1")

            def disable_connect(*args, **kwargs):
                raise RuntimeError("No http calls allowed")

            m.setattr(
                urllib3.connection.HTTPConnection,
                "connect",
                disable_connect,
            )
            m.setattr(
                urllib3.connection.HTTPSConnection,
                "connect",
                disable_connect,
            )

            # Need to re-import huggingface_hub
            # and friends to set up offline mode
            _re_import_modules()
            # Cached model files should be used in offline mode
            for model_config in MODEL_CONFIGS:
                LLM(**model_config)
        finally:
            # Reset the environment after the test
            # NB: Assuming tests are run in online mode
            _re_import_modules()


def _re_import_modules():

View File

@ -70,7 +70,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
                                    more_args):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:

View File

@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
import pytest
import torch

from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL


@pytest.fixture(autouse=True)
@ -25,87 +24,111 @@ def clear_cache():
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, use_v1: bool, device: str, monkeypatch): def test_env(
name: str,
use_v1: bool,
device: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
""" """
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") with monkeypatch.context() as m:
override_backend_env_variable(monkeypatch, name) m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, name)
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform", RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16, 16,
False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform", with patch("vllm.attention.selector.current_platform",
CudaPlatform()): CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16, backend = get_attn_backend(16, torch.float16, torch.float16,
16, False) 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform",
RocmPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED
elif device == "openvino":
with patch("vllm.attention.selector.current_platform",
OpenVinoPlatform()), patch.dict('sys.modules',
{'openvino': Mock()}):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
assert backend.get_name() == "OPENVINO"
else:
if name in ["XFORMERS", "FLASHINFER"]:
with patch("vllm.attention.selector.current_platform",
CudaPlatform()):
backend = get_attn_backend(16, torch.float16,
torch.float16, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
assert backend.get_name() == EXPECTED
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
    """Test FlashAttn validation."""
    # TODO: When testing for v1, pipe in `use_v1` as an argument to
    # get_attn_backend

    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)

        # Unsupported CUDA arch
        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
                            (7, 5))
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Reset the monkeypatch for subsequent tests
        monkeypatch.undo()

        # Unsupported data type
        backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Unsupported kv cache data type
        backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Unsupported block size
        backend = get_attn_backend(16, torch.float16, None, 8, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # flash-attn is not installed
        import sys
        original_module = sys.modules.get('vllm_flash_attn')
        monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
        backend = get_attn_backend(16, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Restore the original module if it existed
        if original_module is not None:
            monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
                                original_module)
        else:
            monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)

        # Unsupported head size
        backend = get_attn_backend(17, torch.float16, None, 16, False)
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Attention-free models should bypass env and use PlaceholderAttention
        backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
        assert backend.get_name() != STR_FLASH_ATTN_VAL
@pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("use_v1", [True, False])
def test_invalid_env(use_v1: bool, monkeypatch): def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
"""Ignore the invalid env variable if it is set."""
monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
with patch("vllm.attention.selector.current_platform", CudaPlatform()): with monkeypatch.context() as m, patch(
"vllm.attention.selector.current_platform", CudaPlatform()):
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
# Test with head size 32
backend = get_attn_backend(32, torch.float16, None, 16, False) backend = get_attn_backend(32, torch.float16, None, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED assert backend.get_name() == EXPECTED

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import pytest
import torch
@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_dequantize_opcheck(): def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
qweight = torch.randint(-2000000000, m.setenv("VLLM_USE_TRITON_AWQ", "0")
2000000000, (8192, 256), qweight = torch.randint(-2000000000,
device='cuda', 2000000000, (8192, 256),
dtype=torch.int32) device='cuda',
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) dtype=torch.int32)
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
split_k_iters = 0 zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
thx = 0 split_k_iters = 0
thy = 0 thx = 0
opcheck(torch.ops._C.awq_dequantize, thy = 0
(qweight, scales, zeros, split_k_iters, thx, thy)) opcheck(torch.ops._C.awq_dequantize,
(qweight, scales, zeros, split_k_iters, thx, thy))
@pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skip(reason="Not working; needs investigation.")
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.") reason="AWQ is not supported on this GPU type.")
def test_awq_gemm_opcheck(): def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_USE_TRITON_AWQ"] = "0" with monkeypatch.context() as m:
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) m.setenv("VLLM_USE_TRITON_AWQ", "0")
qweight = torch.randint(-2000000000, input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
2000000000, (8192, 256), qweight = torch.randint(-2000000000,
device='cuda', 2000000000, (8192, 256),
dtype=torch.int32) device='cuda',
scales = torch.randint(-2000000000, dtype=torch.int32)
2000000000, (64, 256), scales = torch.randint(-2000000000,
device='cuda', 2000000000, (64, 256),
dtype=torch.int32) device='cuda',
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) dtype=torch.int32)
split_k_iters = 8 qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
opcheck(torch.ops._C.awq_gemm, split_k_iters = 8
(input, qweight, qzeros, scales, split_k_iters)) opcheck(torch.ops._C.awq_gemm,
(input, qweight, qzeros, scales, split_k_iters))

View File

@ -1,13 +1,11 @@
# SPDX-License-Identifier: Apache-2.0

import pytest
import torch

from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.rocm import RocmPlatform
from vllm.utils import STR_BACKEND_ENV_VAR


@pytest.fixture(autouse=True)
@ -17,15 +15,19 @@ def clear_cache():
    _cached_get_attn_backend.cache_clear()


def test_selector(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")

        # Set the current platform to ROCm using monkeypatch
        monkeypatch.setattr("vllm.attention.selector.current_platform",
                            RocmPlatform())

        # Test standard ROCm attention
        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
        assert (backend.get_name() == "ROCM_FLASH"
                or backend.get_name() == "ROCM_ATTN_VLLM_V1")

        # mla test for deepseek related
        backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
                                   False, True)

View File

@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = (
class QKVInputs(NamedTuple):
    '''
    Data structure for representing unpacked attention inputs,
    query/key/values and their sequence lengths.

    Attributes:

        * {query,key,value}: unpacked (batch_size x padded_seq_len x
                             num_heads x head_size) attention inputs
        * q_seq_lens: query sequence lengths list
        * kv_seq_lens: shared key/value sequence lengths list
@ -56,14 +56,14 @@ class QKVInputs(NamedTuple):
class QKVO(NamedTuple):
    '''
    Data structure for representing unpacked attention inputs,
    alongside unpacked known-correct attention output

    Attributes:

        * qkv: unpacked (batch_size x padded_seq_len x
               num_heads x head_size) attention inputs
        * ideal_output: unpacked (batch_size x padded_seq_len x
                        num_heads x head_size) known-correct attention output
    '''
@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple):
    Attributes:

        * {query,key,value}: packed (number_of_tokens x num_heads
                             x head_size) attention inputs
        * q_start_loc_list: list of query start locations within packed tensor
        * kv_start_loc_list: shared list of key/value start locations within
@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple):
class PackedQKVO(NamedTuple):
    '''
    Data structure for representing packed attention inputs,
    alongside packed known-correct attention output

    Attributes:

        * packed_qkv: packed (number_of_tokens x num_heads
                      x head_size) attention inputs
        * ideal_output: packed (number_of_tokens x num_heads
                        x head_size) known-correct attention output
    '''
@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple):
    Attributes:

        * packed_qkvo: packed (number_of_tokens x num_heads
                       x head_size) attention inputs & known-correct
                       output
        * kv_mmap: KV cache memory mapping, specific to this test phase &
@ -195,7 +195,7 @@ def make_causal_mask(
    Create a q_max_seq_len x kv_max_seq_len causal mask

    Arguments:

    * q_max_seq_len: query max seq len
    * kv_max_seq_len: key/value max seq len
@ -320,9 +320,9 @@ def make_qkv(
    * max_kv_seq_len: max key/value seq len
    * num_heads
    * head_size
    * is_encoder_decoder_attn: if True, query seqlen may differ from
      key/value seqlen (as is often the case for cross-attention);
      o/w, query/key/value seqlens match at each batch index
      (max_kv_seq_len is unused)
    * force_kv_seq_lens: if not None, overrides kv sequence lengths
    * attn_type: encoder, decoder self, or enc/dec cross attention
@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
    Individually pack each of Q, K and V, each with dimensions batch_size x
    padded_seq_len x num_heads x head_size, into respective number_of_tokens x
    num_heads x head_size tensors.

    For Q, number_of_tokens = sum(q_seq_lens).

    For K and V, number_of_tokens = sum(kv_seq_lens)
@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int,
    Returns:

    * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
      * for backend 'XFORMERS'
    * kv_cache: 2 x num_blocks x block_size x num_heads x head_size
      * for backend 'FLASH_ATTN'
    '''
    if backend == 'XFORMERS':
        kv_cache = torch.rand(
@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
    Context:

    * Your goal is to test (1) prefill of N prompts, with prompt-lengths
      {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token
      for all N prompts (N tokens total); the resultant sequence lengths
      after decode would be {K_i + 1 for i \\in [0,N)}
    * The test you want to do requires (1) having the prefill slot mapping
      for all tokens present during prefill, the number of which is
      M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N
      decoded tokens

    This function consumes a single 1D slot mapping, which is the
    concatenation of N slot mappings each of length K_i + 1 (corresponding
    to the sequence lengths after decode), with a total length of
    P = \\sum_i{K_i + 1} = M + N

    The prefill-phase slot mapping results from excising the (K_i + 1)-th entry
    from each of the N subsequences in the slot mapping (i.e. omitting the
    decoded token's mapping.)

    The N excised entries are appended to obtain the decode-phase slot mapping
@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
* slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N
post-decode sequences post-decode sequences
* seq_lens: list of N post-decode sequence lengths (K_i + 1 in the * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the
description above) description above)
* device: cuda, cpu, etc. * device: cuda, cpu, etc.
Returns: Returns:
* prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor)
reflecting all N prefill prompts reflecting all N prefill prompts
* decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting
all N decoded tokens all N decoded tokens
''' '''
@ -725,7 +725,7 @@ def make_block_tables_slot_mapping(
Then the minimum KV cache size in blocks is Then the minimum KV cache size in blocks is
total_cache_blocks = sum(num_blocks for all seqs) total_cache_blocks = sum(num_blocks for all seqs)
Then, the blocktable mapping counts downward from Then, the blocktable mapping counts downward from
@ -734,7 +734,7 @@ def make_block_tables_slot_mapping(
to to
block_base_addr block_base_addr
The constructed block-tables and slot-mapping are sized to the The constructed block-tables and slot-mapping are sized to the
lengths of the sequences in their entirety (as reflected by seq_lens), lengths of the sequences in their entirety (as reflected by seq_lens),
@ -749,7 +749,7 @@ def make_block_tables_slot_mapping(
Return: Return:
* block_tables_tensor: block table for sequence * block_tables_tensor: block table for sequence
* slot_mapping_list: slot mapping for sequence * slot_mapping_list: slot mapping for sequence
* max_block_idx: the highest block address within this block table * max_block_idx: the highest block address within this block table
''' '''
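The block-count arithmetic referenced above can be sketched as follows (an illustration assuming each sequence is rounded up to whole blocks; not the helper itself):
import math
def min_cache_blocks(seq_lens: list[int], block_size: int) -> int:
    # total_cache_blocks = sum(num_blocks for all seqs), where a sequence of
    # length seq_len needs ceil(seq_len / block_size) blocks.
    return sum(math.ceil(seq_len / block_size) for seq_len in seq_lens)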
@ -807,7 +807,7 @@ def make_test_metadata(
encoder_test_params and cross_test_params arguments allow encoder encoder_test_params and cross_test_params arguments allow encoder
attention and enc/dec cross-attention (respectively) to use distinct attention and enc/dec cross-attention (respectively) to use distinct
metadata values from decoder self-attention (decoder_test_params). metadata values from decoder self-attention (decoder_test_params).
If encoder_test_params and cross_test_params are None, the attention If encoder_test_params and cross_test_params are None, the attention
metadata will support the decoder-only scenario. metadata will support the decoder-only scenario.
@ -820,7 +820,7 @@ def make_test_metadata(
* attn_backend_name: Backend for sourcing attention kernels * attn_backend_name: Backend for sourcing attention kernels
* is_prompt: prefill if True, o/w decode * is_prompt: prefill if True, o/w decode
* seq_lens: list of token counts for each sequence * seq_lens: list of token counts for each sequence
* decoder_test_params: decoder self-attention test params; * decoder_test_params: decoder self-attention test params;
this function requires this function requires
kv_mmap (memory mapping) field kv_mmap (memory mapping) field
* device: CPU or CUDA device * device: CPU or CUDA device

View File

@ -3,10 +3,9 @@
import pytest import pytest
import vllm import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "THUDM/chatglm3-6b" MODEL_PATH = "THUDM/chatglm3-6b"
@ -55,7 +54,7 @@ def v1(run_with_both_engines_lora):
pass pass
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_chatglm3_lora(chatglm3_lora_files): def test_chatglm3_lora(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,
@ -75,7 +74,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_chatglm3_lora_tp4(chatglm3_lora_files): def test_chatglm3_lora_tp4(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,
@ -96,7 +95,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,

View File

@ -4,10 +4,9 @@ import pytest
import ray import ray
import vllm import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "meta-llama/Llama-2-7b-hf" MODEL_PATH = "meta-llama/Llama-2-7b-hf"
@ -82,7 +81,7 @@ def v1(run_with_both_engines_lora):
# V1 Test: Failing due to numerics on V1. # V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_llama_lora(sql_lora_files): def test_llama_lora(sql_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
@ -97,7 +96,7 @@ def test_llama_lora(sql_lora_files):
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks # Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
# used by the engine yet. # used by the engine yet.
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_llama_lora_warmup(sql_lora_files): def test_llama_lora_warmup(sql_lora_files):
"""Test that the LLM initialization works with a warmup LORA path and """Test that the LLM initialization works with a warmup LORA path and
is more conservative""" is more conservative"""
@ -128,7 +127,7 @@ def test_llama_lora_warmup(sql_lora_files):
# V1 Test: Failing due to numerics on V1. # V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files): def test_llama_lora_tp4(sql_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
@ -143,7 +142,7 @@ def test_llama_lora_tp4(sql_lora_files):
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
@ -159,7 +158,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
llm = vllm.LLM( llm = vllm.LLM(

View File

@ -3,11 +3,12 @@
import pytest import pytest
import vllm import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
PROMPT_TEMPLATE = ( PROMPT_TEMPLATE = (
@ -57,7 +58,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm") reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_minicpmv_lora(minicpmv_lora_files): def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
@ -80,7 +81,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm") reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
@ -101,7 +102,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm") reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,

View File

@ -3,10 +3,9 @@
import pytest import pytest
import vllm import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ..utils import multi_gpu_test from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "ArthurZ/ilama-3.2-1B" MODEL_PATH = "ArthurZ/ilama-3.2-1B"
@ -56,7 +55,7 @@ def v1(run_with_both_engines_lora):
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_ilama_lora(ilama_lora_files): def test_ilama_lora(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,
@ -77,7 +76,7 @@ def test_ilama_lora(ilama_lora_files):
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_ilama_lora_tp4(ilama_lora_files): def test_ilama_lora_tp4(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,
@ -99,7 +98,7 @@ def test_ilama_lora_tp4(ilama_lora_files):
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files): def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,

View File

@ -12,11 +12,10 @@ import pytest
from tests.kernels.utils import override_backend_env_variable from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true"
@pytest.mark.quant_model @pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
@ -55,45 +54,47 @@ def test_models(
backend: str, backend: str,
tensor_parallel_size: int, tensor_parallel_size: int,
disable_async_output_proc: bool, disable_async_output_proc: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Only checks that log probs match to cover the discrepancy in Only checks that log probs match to cover the discrepancy in
numerically sensitive kernels. numerically sensitive kernels.
""" """
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8 NUM_LOG_PROBS = 8
with vllm_runner( with vllm_runner(
base_model, base_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
kv_cache_dtype="auto", kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs( baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner( with vllm_runner(
test_model, test_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs( test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=baseline_outputs, outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs, outputs_1_lst=test_outputs,
name_0="fp16_kv_cache", name_0="fp16_kv_cache",
name_1="fp8_kv_cache", name_1="fp8_kv_cache",
) )
@pytest.mark.cpu_model @pytest.mark.cpu_model
@ -119,38 +120,41 @@ def test_cpu_models(
test_model: str, test_model: str,
max_tokens: int, max_tokens: int,
disable_async_output_proc: bool, disable_async_output_proc: bool,
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Only checks that log probs match to cover the discrepancy in Only checks that log probs match to cover the discrepancy in
numerically sensitive kernels. numerically sensitive kernels.
""" """
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8 NUM_LOG_PROBS = 8
with vllm_runner( with vllm_runner(
base_model, base_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
dtype="bfloat16", dtype="bfloat16",
kv_cache_dtype="auto", kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs( baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner( with vllm_runner(
test_model, test_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
dtype="bfloat16", dtype="bfloat16",
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs( test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=baseline_outputs, outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs, outputs_1_lst=test_outputs,
name_0="bf16_kv_cache", name_0="bf16_kv_cache",
name_1="fp8_kv_cache", name_1="fp8_kv_cache",
) )

View File

@ -201,6 +201,7 @@ def test_models(
) )
@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])

View File

@ -17,7 +17,7 @@ from vllm.utils import identity
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
_VideoAssets) _VideoAssets)
from ....utils import (fork_new_process_for_each_test, large_gpu_mark, from ....utils import (create_new_process_for_each_test, large_gpu_mark,
multi_gpu_marks) multi_gpu_marks)
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils import custom_inputs, model_utils, runners
@ -592,7 +592,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_single_image_models(tmp_path: PosixPath, model_type: str, def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
@ -617,7 +617,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE, test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str, def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
@ -642,7 +642,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING, test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_image_embedding_models(model_type: str, def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
@ -666,7 +666,7 @@ def test_image_embedding_models(model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
@ -688,7 +688,7 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_custom_inputs_models( def test_custom_inputs_models(
model_type: str, model_type: str,
@ -714,9 +714,9 @@ def test_custom_inputs_models(
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
@ -740,9 +740,9 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE, test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
@ -766,9 +766,9 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING, test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str, def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
@ -791,7 +791,7 @@ def test_image_embedding_models_heavy(model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
@ -814,9 +814,9 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_custom_inputs_models_heavy( def test_custom_inputs_models_heavy(
model_type: str, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,

View File

@ -0,0 +1,229 @@
# SPDX-License-Identifier: Apache-2.0
import os
import re
from typing import Optional
import pytest
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
models = [model_path]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]],
model: str):
"""Sanitize vllm output to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
assert output_str_without_image[0] == " "
output_str_without_image = output_str_without_image[1:]
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
tokenizer = AutoTokenizer.from_pretrained(model)
hf_output_ids = tokenizer.encode(output_str_without_image)
assert hf_output_ids[0] == 1
hf_output_ids = hf_output_ids[1:]
return hf_output_ids, hf_output_str, out_logprobs
target_dtype = "half"
# ROCm Triton FA can run into shared memory issues with these models,
# so use other backends in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: list[tuple[list[str], PromptImageInput]],
model: str,
*,
max_model_len: int,
dtype: str,
max_tokens: int,
num_logprobs: int,
mm_limit: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by the vLLM contract.
The text output is sanitized so that it can be compared with the HF output.
"""
# NOTE: take care of the order: run vLLM first, and then run HF.
# vLLM needs a fresh process in which CUDA has not yet been initialized.
# If we run HF first, CUDA will already be initialized, which breaks the
# multiprocessing backend with the fork start method (the default).
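A tiny standalone sketch of the failure mode the NOTE above guards against (hypothetical snippet, requires a GPU; not part of the test):
import multiprocessing as mp
import torch
def _child() -> None:
    # Raises "Cannot re-initialize CUDA in forked subprocess" because the
    # parent process already initialized CUDA before forking.
    torch.zeros(1, device="cuda")
if __name__ == "__main__":
    torch.zeros(1, device="cuda")  # parent touches CUDA first
    proc = mp.get_context("fork").Process(target=_child)
    proc.start()
    proc.join()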
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
lora_extra_vocab_size=0,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
eos_token_id = hf_model.processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
eos_token_id=eos_token_id,
num_logits_to_keep=0)
for prompts, images in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
# Since we use _attn_implementation="eager" for hf_runner, there is a more
# significant numerical difference, so the basic `logprobs=5` check fails.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.7, 0.75, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [4096])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test(
hf_runner,
vllm_runner,
inputs_per_image,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=1,
tensor_parallel_size=1,
)
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
# [],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [10000])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.xfail(
reason="Phi-4-MM multi-image inference is divergent with hf model.")
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors])
]
run_test(
hf_runner,
vllm_runner,
inputs_per_case,
model,
dtype=dtype,
max_model_len=max_model_len,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
mm_limit=2,
tensor_parallel_size=1,
)

View File

@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType) ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo], def get_filtered_test_settings(
test_type: VLMTestType, test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
fork_per_test: bool) -> dict[str, VLMTestInfo]: new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict """Given the dict of potential test settings to run, return a subdict
of tests that have the current test type enabled with the matching value for of tests that have the current test type enabled with the matching value for
fork_per_test. fork_per_test.
@ -43,7 +43,7 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
# Everything looks okay; keep if this has correct proc handling # Everything looks okay; keep if this has correct proc handling
if (test_info.distributed_executor_backend if (test_info.distributed_executor_backend
is not None) == fork_per_test: is not None) == new_proc_per_test:
matching_tests[test_name] = test_info matching_tests[test_name] = test_info
return matching_tests return matching_tests
@ -51,14 +51,14 @@ def get_filtered_test_settings(test_settings: dict[str, VLMTestInfo],
def get_parametrized_options(test_settings: dict[str, VLMTestInfo], def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType, test_type: VLMTestType,
fork_new_process_for_each_test: bool): create_new_process_for_each_test: bool):
"""Converts all of our VLMTestInfo into an expanded list of parameters. """Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases. size factors etc, while still running in isolated test cases.
""" """
matching_tests = get_filtered_test_settings( matching_tests = get_filtered_test_settings(
test_settings, test_type, fork_new_process_for_each_test) test_settings, test_type, create_new_process_for_each_test)
# Ensure that something is wrapped as an iterable if it's not already # Ensure that something is wrapped as an iterable if it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, ) ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
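A loose illustration of the expansion described in the docstring above (the field names here are invented, not the real VLMTestInfo schema):
import itertools
import pytest
def expand_options(test_settings: dict) -> list:
    # Build one flat list of pytest.param entries, the product of each test's
    # own axes, so every combination still runs as an isolated test case.
    params = []
    for name, info in test_settings.items():
        for size_factor, dtype in itertools.product(info["size_factors"],
                                                    info["dtypes"]):
            params.append(
                pytest.param(name, size_factor, dtype,
                             id=f"{name}-{size_factor}-{dtype}"))
    return params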

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import importlib.util import importlib.util
import math import math
@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine
import vllm import vllm
import vllm.config import vllm.config
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
@ -29,36 +31,34 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch): def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler from vllm.model_executor.models.gritlm import GritLMPooler
# Create an LLM object to get the model config. # Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config) pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server_embedding(): def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
with pytest.MonkeyPatch.context() as mp: args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
@ -69,9 +69,12 @@ def server_generate():
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer): async def client_embedding(monkeypatch: pytest.MonkeyPatch,
async with server_embedding.get_async_client() as async_client: server_embedding: RemoteOpenAIServer):
yield async_client with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
async with server_embedding.get_async_client() as async_client:
yield async_client
@pytest_asyncio.fixture @pytest_asyncio.fixture
@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
yield async_client yield async_client
def run_llm_encode(llm: vllm.LLM, queries: list[str], def run_llm_encode(
instruction: str) -> list[float]: llm: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.encode([instruction + q for q in queries], )
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings(client: vllm.LLM, queries: list[str], async def run_client_embeddings(
instruction: str) -> list[float]: client: vllm.LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
@ -106,7 +115,7 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm README.md in https://github.com/ContextualAI/gritlm
""" """
q_instruction = gritlm_instruction( q_instruction = gritlm_instruction(
"Given a scientific paper title, retrieve the paper's abstract") "Given a scientific paper title, retrieve the paper's abstract", )
queries = [ queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System", "Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning", "Generative Representational Instruction Tuning",
@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
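For context, the similarity asserted above is presumably derived from SciPy's cosine distance, which this file imports; a minimal sketch:
from scipy.spatial.distance import cosine
def cos_sim(a: list[float], b: list[float]) -> float:
    # scipy returns the cosine *distance*, so similarity is 1 - distance.
    return 1.0 - cosine(a, b)
The query/document embeddings returned by the encode calls below would then be compared against reference values such as 0.532 within a small absolute tolerance.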
def test_gritlm_offline_embedding(monkeypatch): def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
d_rep = run_llm_encode( d_rep = run_llm_encode(
llm, llm,
documents, documents,
d_instruction, d_instruction,
) )
q_rep = run_llm_encode( q_rep = run_llm_encode(
llm, llm,
queries, queries,
q_instruction, q_instruction,
) )
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_embedding( async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI): client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings( d_rep = await run_client_embeddings(

View File

@ -10,7 +10,7 @@ import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ....utils import fork_new_process_for_each_test, multi_gpu_test from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [ PROMPTS = [
{ {
@ -119,7 +119,7 @@ def run_test(
assert output.outputs[0].text == expected assert output.outputs[0].text == expected
@fork_new_process_for_each_test @create_new_process_for_each_test()
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])

View File

@ -1,86 +1,100 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from ..utils import fork_new_process_for_each_test from ..utils import create_new_process_for_each_test
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_plugin(dummy_opt_path, monkeypatch): def test_plugin(
monkeypatch: pytest.MonkeyPatch,
dummy_opt_path: str,
):
# V1 shuts down rather than raising an error here. # V1 shuts down rather than raising an error here.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
os.environ["VLLM_PLUGINS"] = "" m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as excinfo: m.setenv("VLLM_PLUGINS", "")
LLM(model=dummy_opt_path, load_format="dummy")
error_msg = "has no vLLM implementation and " \ with pytest.raises(Exception) as excinfo:
"the Transformers implementation is not compatible with vLLM" LLM(model=dummy_opt_path, load_format="dummy")
assert (error_msg in str(excinfo.value)) error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501
assert (error_msg in str(excinfo.value))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_oot_registration_text_generation(dummy_opt_path): def test_oot_registration_text_generation(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
prompts = ["Hello, my name is", "The text does not matter"] dummy_opt_path: str,
sampling_params = SamplingParams(temperature=0) ):
llm = LLM(model=dummy_opt_path, load_format="dummy") with monkeypatch.context() as m:
first_token = llm.get_tokenizer().decode(0) m.setenv("VLLM_PLUGINS", "register_dummy_model")
outputs = llm.generate(prompts, sampling_params) prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_opt_path, load_format="dummy")
first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
# make sure only the first token is generated # make sure only the first token is generated
rest = generated_text.replace(first_token, "") rest = generated_text.replace(first_token, "")
assert rest == "" assert rest == ""
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_oot_registration_embedding(dummy_gemma2_embedding_path): def test_oot_registration_embedding(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
prompts = ["Hello, my name is", "The text does not matter"] dummy_gemma2_embedding_path: str,
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") ):
outputs = llm.embed(prompts) with monkeypatch.context() as m:
m.setenv("VLLM_PLUGINS", "register_dummy_model")
prompts = ["Hello, my name is", "The text does not matter"]
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
outputs = llm.embed(prompts)
for output in outputs: for output in outputs:
assert all(v == 0 for v in output.outputs.embedding) assert all(v == 0 for v in output.outputs.embedding)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB") image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): def test_oot_registration_multimodal(
os.environ["VLLM_PLUGINS"] = "register_dummy_model" monkeypatch: pytest.MonkeyPatch,
prompts = [{ dummy_llava_path: str,
"prompt": "What's in the image?<image>", ):
"multi_modal_data": { with monkeypatch.context() as m:
"image": image m.setenv("VLLM_PLUGINS", "register_dummy_model")
}, prompts = [{
}, { "prompt": "What's in the image?<image>",
"prompt": "Describe the image<image>", "multi_modal_data": {
"multi_modal_data": { "image": image
"image": image },
}, }, {
}] "prompt": "Describe the image<image>",
"multi_modal_data": {
"image": image
},
}]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model=dummy_llava_path, llm = LLM(model=dummy_llava_path,
load_format="dummy", load_format="dummy",
max_num_seqs=1, max_num_seqs=1,
trust_remote_code=True, trust_remote_code=True,
gpu_memory_utilization=0.98, gpu_memory_utilization=0.98,
max_model_len=4096, max_model_len=4096,
enforce_eager=True, enforce_eager=True,
limit_mm_per_prompt={"image": 1}) limit_mm_per_prompt={"image": 1})
first_token = llm.get_tokenizer().decode(0) first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
# make sure only the first token is generated # make sure only the first token is generated
rest = generated_text.replace(first_token, "") rest = generated_text.replace(first_token, "")
assert rest == "" assert rest == ""

View File

@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
ModelRegistry) ModelRegistry)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test from ..utils import create_new_process_for_each_test
from .registry import HF_EXAMPLE_MODELS from .registry import HF_EXAMPLE_MODELS
@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
assert supports_multimodal(model_cls) assert supports_multimodal(model_cls)
@fork_new_process_for_each_test @create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [ @pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
("LlamaForCausalLM", False, False, False), ("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False), ("MllamaForConditionalGeneration", True, False, False),
@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
stacklevel=2) stacklevel=2)
@fork_new_process_for_each_test @create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [ @pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
("MLPSpeculatorPreTrainedModel", False, False), ("MLPSpeculatorPreTrainedModel", False, False),
("DeepseekV2ForCausalLM", True, False), ("DeepseekV2ForCausalLM", True, False),

View File

@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch): async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") parser = FlexibleArgumentParser(
parser = make_arg_parser(parser) description="vLLM's remote OpenAI server.")
args = parser.parse_args([]) parser = make_arg_parser(parser)
args = parser.parse_args([])
# When LLMEngine is loaded, it will crash. # When LLMEngine is loaded, it will crash.
def mock_init(): def mock_init():
raise ValueError raise ValueError
monkeypatch.setattr(LLMEngine, "__init__", mock_init) m.setattr(LLMEngine, "__init__", mock_init)
start = time.perf_counter() start = time.perf_counter()
async with build_async_engine_client(args): async with build_async_engine_client(args):
pass pass
end = time.perf_counter() end = time.perf_counter()
assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " assert end - start < 60, (
"if there is an error in the startup.") "Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup.")
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@ -5,7 +5,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations, from ..utils import (completions_with_server_args, get_client_text_generations,
@ -52,7 +52,7 @@ async def test_multi_step(
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment. client/server environment.
@ -82,67 +82,70 @@ async def test_multi_step(
pytest.skip("Multi-step with Chunked-Prefill only supports" pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend") "PP=1 and FLASH_ATTN backend")
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1) prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts] prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts assert len(prompts) == num_prompts
server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
ms_server_args = DEFAULT_SERVER_ARGS + \ ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"] ["--num-scheduler-steps", f"{num_scheduler_steps}"]
if not is_async: if not is_async:
ms_server_args += ["--disable-async-output-proc"] ms_server_args += ["--disable-async-output-proc"]
if eager_mode: if eager_mode:
ms_server_args.append("--enforce-eager") ms_server_args.append("--enforce-eager")
if enable_chunked_prefill: if enable_chunked_prefill:
ms_server_args.append("--enable-chunked-prefill") ms_server_args.append("--enable-chunked-prefill")
distributed_args = [ distributed_args = [
"--tensor-parallel-size", "--tensor-parallel-size",
str(tp_size), str(tp_size),
"--pipeline-parallel-size", "--pipeline-parallel-size",
str(pp_size), str(pp_size),
] ]
# Spin up client/server & issue completion API requests. # Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically # Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to # raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI # observed timeouts in GHA CI
ref_completions = await completions_with_server_args( ref_completions = await completions_with_server_args(
prompts, prompts,
model, model,
server_args + distributed_args, server_args + distributed_args,
num_logprobs, num_logprobs,
max_wait_seconds=5 * 240) max_wait_seconds=5 * 240)
test_completions = await completions_with_server_args( test_completions = await completions_with_server_args(
prompts, prompts,
model, model,
ms_server_args + distributed_args, ms_server_args + distributed_args,
num_logprobs, num_logprobs,
max_wait_seconds=5 * 240) max_wait_seconds=5 * 240)
# Assert multi-step scheduling produces identical tokens # Assert multi-step scheduling produces identical tokens
# to single-step scheduling. # to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions) ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions) test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations assert ref_generations == test_generations
# Assert multi-step scheduling produces nearly-identical logprobs # Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling. # to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions) ref_text_logprobs = get_client_text_logprob_generations(
test_text_logprobs = get_client_text_logprob_generations(test_completions) ref_completions)
check_logprobs_close( test_text_logprobs = get_client_text_logprob_generations(
outputs_0_lst=ref_text_logprobs, test_completions)
outputs_1_lst=test_text_logprobs, check_logprobs_close(
name_0="hf", outputs_0_lst=ref_text_logprobs,
name_1="vllm", outputs_1_lst=test_text_logprobs,
) name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(("tp_size, pp_size"), [ @pytest.mark.parametrize(("tp_size, pp_size"), [
@ -152,7 +155,7 @@ async def test_multi_step(
async def test_multi_step_pp_smoke( async def test_multi_step_pp_smoke(
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Smoke test for the vLLM engine with multi-step scheduling in an Smoke test for the vLLM engine with multi-step scheduling in an
@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke(
attention_backend = "FLASH_ATTN" attention_backend = "FLASH_ATTN"
max_num_seqs = 3 max_num_seqs = 3
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
# Prompt from the ShareGPT dataset # Prompt from the ShareGPT dataset
prompts = [ prompts = [
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore "in the jtbd context whats a push?", # codespell:ignore
] ]
# Use varying max_tokens to introduce scheduling randomness. # Use varying max_tokens to introduce scheduling randomness.
max_tokens = [10 * i for i in range(1, len(prompts) + 1)] max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
assert len(prompts) == len(max_tokens) assert len(prompts) == len(max_tokens)
test_args = [ test_args = [
"--tensor-parallel-size", "--tensor-parallel-size",
str(tp_size), "--pipeline-parallel-size", str(tp_size), "--pipeline-parallel-size",
str(pp_size), "--max-num-seqs", str(pp_size), "--max-num-seqs",
str(max_num_seqs) str(max_num_seqs)
] ]
server_args = DEFAULT_SERVER_ARGS + test_args server_args = DEFAULT_SERVER_ARGS + test_args
ms_server_args = DEFAULT_SERVER_ARGS + \ ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
test_args test_args
# Spin up client/server & issue completion API requests. # Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically # Default `max_wait_seconds` is 240 but was empirically
# raised 3x to 720 *just for this test* due to # raised 3x to 720 *just for this test* due to
# observed timeouts in GHA CI # observed timeouts in GHA CI
ref_completions = await completions_with_server_args( ref_completions = await completions_with_server_args(
prompts=prompts, prompts=prompts,
model_name=model, model_name=model,
server_cli_args=server_args, server_cli_args=server_args,
num_logprobs=None, num_logprobs=None,
max_wait_seconds=5 * 240, max_wait_seconds=5 * 240,
max_tokens=max_tokens) max_tokens=max_tokens)
test_completions = await completions_with_server_args( test_completions = await completions_with_server_args(
prompts=prompts, prompts=prompts,
model_name=model, model_name=model,
server_cli_args=ms_server_args, server_cli_args=ms_server_args,
num_logprobs=None, num_logprobs=None,
max_wait_seconds=5 * 240, max_wait_seconds=5 * 240,
max_tokens=max_tokens) max_tokens=max_tokens)
# Assert multi-step scheduling produces identical tokens # Assert multi-step scheduling produces identical tokens
# to single-step scheduling. # to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions) ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions) test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations assert ref_generations == test_generations

View File

@ -7,7 +7,7 @@ from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
@ -42,7 +42,7 @@ def test_multi_step_llm(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine. """Test vLLM engine with multi-step scheduling via sync LLM Engine.
@ -70,48 +70,49 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned. completions endpoint; `None` -> 1 logprob returned.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1) prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts] prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts assert len(prompts) == num_prompts
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
) as vllm_model: ) as vllm_model:
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else if num_logprobs is None else
vllm_model.generate_greedy_logprobs( vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)) prompts, max_tokens, num_logprobs))
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else if num_logprobs is None else
hf_model.generate_greedy_logprobs_limit( hf_model.generate_greedy_logprobs_limit(
prompts, max_tokens, num_logprobs)) prompts, max_tokens, num_logprobs))
if num_logprobs is None: if num_logprobs is None:
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
else: else:
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs: Optional[int], num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int], num_prompt_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine. """Test prompt logprobs with multi-step scheduling via sync LLM Engine.
@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the note that this argument is not supported by the
OpenAI completions endpoint. OpenAI completions endpoint.
""" """
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts prompts = example_prompts
if len(prompts) < num_prompts: if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1) prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts] prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts assert len(prompts) == num_prompts
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts, prompts,
max_tokens, max_tokens,
num_logprobs, num_logprobs,
num_prompt_logprobs=num_prompt_logprobs) num_prompt_logprobs=num_prompt_logprobs)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
) as vllm_model: ) as vllm_model:
single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts, prompts,
max_tokens, max_tokens,
num_logprobs, num_logprobs,
num_prompt_logprobs=num_prompt_logprobs) num_prompt_logprobs=num_prompt_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=single_step_vllm_outputs, outputs_0_lst=single_step_vllm_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts: int, num_prompts: int,
num_logprobs: Optional[int], num_logprobs: Optional[int],
attention_backend: str, attention_backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC. """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
# #
# The incorrect scheduling behavior - if it occurs - will cause an exception # The incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`. # in the model runner resulting from `do_sample=False`.
override_backend_env_variable(monkeypatch, attention_backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
assert len(example_prompts) >= 2 assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts) challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' challenge_prompts[0] = (
'inference and serving engine for LLMs.\n' 'vLLM is a high-throughput and memory-efficient '
) # 24 tok 'inference and serving engine for LLMs.\n') # 24 tok
challenge_prompts[1] = ( challenge_prompts[1] = (
'Briefly describe the major milestones in the ' 'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.\n' 'development of artificial intelligence from 1950 to 2020.\n'
) # 30 tok ) # 30 tok
# If necessary, adjust the length of `challenge_prompts` to match # If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts` # `num_prompts`
if len(challenge_prompts) < num_prompts: if len(challenge_prompts) < num_prompts:
challenge_prompts = (challenge_prompts * challenge_prompts = (challenge_prompts *
((num_prompts // len(challenge_prompts)) + 1)) ((num_prompts // len(challenge_prompts)) + 1))
challenge_prompts = challenge_prompts[:num_prompts] challenge_prompts = challenge_prompts[:num_prompts]
assert len(challenge_prompts) == num_prompts assert len(challenge_prompts) == num_prompts
# Single-step scheduler baseline # Single-step scheduler baseline
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
max_model_len=48, max_model_len=48,
max_num_batched_tokens=48, max_num_batched_tokens=48,
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_baseline = (vllm_model.generate_greedy( outputs_baseline = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
# multi-step+"single-step chunked prefill"+APC # multi-step+"single-step chunked prefill"+APC
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
enable_chunked_prefill=True, enable_chunked_prefill=True,
enable_prefix_caching=True, enable_prefix_caching=True,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
max_model_len=48, max_model_len=48,
max_num_batched_tokens=48, max_num_batched_tokens=48,
max_num_seqs=4, max_num_seqs=4,
block_size=16, block_size=16,
) as vllm_model: ) as vllm_model:
outputs_w_features = (vllm_model.generate_greedy( outputs_w_features = (
challenge_prompts, max_tokens) if num_logprobs is None else vllm_model.generate_greedy(challenge_prompts, max_tokens) if
vllm_model.generate_greedy_logprobs( num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs)) challenge_prompts, max_tokens, num_logprobs))
if num_logprobs is None: if num_logprobs is None:
# No-logprobs test # No-logprobs test
check_outputs_equal( check_outputs_equal(
outputs_0_lst=outputs_baseline, outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features, outputs_1_lst=outputs_w_features,
name_0="multi-step", name_0="multi-step",
name_1="multi-step+features", name_1="multi-step+features",
) )
else: else:
# Yes-logprobs test # Yes-logprobs test
check_logprobs_close( check_logprobs_close(
outputs_0_lst=outputs_baseline, outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features, outputs_1_lst=outputs_w_features,
name_0="multi-step", name_0="multi-step",
name_1="multi-step+features", name_1="multi-step+features",
) )
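The pattern repeated throughout this file is the switch from the old override_backend_env_variable helper to pytest's built-in monkeypatch.context(), which restores the environment variable even if the test body raises. A minimal sketch of that pattern, using a hypothetical variable name instead of vLLM's STR_BACKEND_ENV_VAR constant:

import os

import pytest


def test_env_var_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # Everything set through `m` is rolled back when the context exits,
    # even if an assertion inside the block fails.
    with monkeypatch.context() as m:
        m.setenv("MY_BACKEND_ENV_VAR", "FLASH_ATTN")  # hypothetical name
        assert os.environ["MY_BACKEND_ENV_VAR"] == "FLASH_ATTN"
    # After the context exits, the variable returns to its prior state
    # (unset here, since the test never set it outside the context).
    assert "MY_BACKEND_ENV_VAR" not in os.environ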

View File

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import neuronxcc.nki.language as nl import neuronxcc.nki.language as nl
import pytest import pytest
@ -99,6 +98,7 @@ def ref_block_tables_transform(
) )
@torch.inference_mode() @torch.inference_mode()
def test_load_and_transform_block_tables( def test_load_and_transform_block_tables(
monkeypatch: pytest.MonkeyPatch,
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
@ -108,46 +108,46 @@ def test_load_and_transform_block_tables(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(10000) torch.manual_seed(10000)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)
# On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
B_P_SIZE = 128 B_P_SIZE = 128
if num_blocks_per_tile < B_P_SIZE: if num_blocks_per_tile < B_P_SIZE:
assert B_P_SIZE % num_blocks_per_tile == 0 assert B_P_SIZE % num_blocks_per_tile == 0
block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
else: else:
block_size_tiling_factor = 1 block_size_tiling_factor = 1
max_num_blocks = 100000 max_num_blocks = 100000
block_tables = torch.randint( block_tables = torch.randint(
0, 0,
max_num_blocks, max_num_blocks,
(num_tiles * num_blocks_per_tile, ), (num_tiles * num_blocks_per_tile, ),
dtype=torch.int32, dtype=torch.int32,
) )
nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
block_tables.to(device=device), block_tables.to(device=device),
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
head_id, head_id,
block_size_tiling_factor, block_size_tiling_factor,
).cpu() ).cpu()
ref_out = ref_block_tables_transform( ref_out = ref_block_tables_transform(
block_tables, block_tables,
num_tiles, num_tiles,
num_blocks_per_tile, num_blocks_per_tile,
q_head_per_kv_head, q_head_per_kv_head,
head_id, head_id,
block_size_tiling_factor, block_size_tiling_factor,
) )
assert (nki_out.shape == ref_out.shape assert (nki_out.shape == ref_out.shape
), f"{nki_out.shape=} != {ref_out.shape=}" ), f"{nki_out.shape=} != {ref_out.shape=}"
assert torch.all(nki_out == ref_out) assert torch.all(nki_out == ref_out)
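The B_P_SIZE branch above only tiles further when a tile holds fewer than 128 blocks, and the split must be exact. A small standalone sketch of that arithmetic with arbitrary example values:

def tiling_factor(num_blocks_per_tile: int, b_p_size: int = 128) -> int:
    # Mirrors the branch in the test: bring the per-tile block count up to
    # the partition size by splitting each block into equal sub-blocks.
    if num_blocks_per_tile < b_p_size:
        assert b_p_size % num_blocks_per_tile == 0
        return b_p_size // num_blocks_per_tile
    return 1


assert tiling_factor(32) == 4     # 32 blocks per tile -> split each block 4 ways
assert tiling_factor(128) == 1    # already at the partition size
assert tiling_factor(256) == 1    # plenty of blocks, no extra tiling needed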

View File

@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
]) ])
@torch.inference_mode() @torch.inference_mode()
def test_contexted_kv_attention( def test_contexted_kv_attention(
monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int, prefill_batch_size: int,
decode_batch_size: int, decode_batch_size: int,
num_heads: int, num_heads: int,
@ -329,7 +330,6 @@ def test_contexted_kv_attention(
large_tile_size, large_tile_size,
mixed_precision: bool, mixed_precision: bool,
) -> None: ) -> None:
import os
import torch_xla.core.xla_model as xm import torch_xla.core.xla_model as xm
@ -340,174 +340,178 @@ def test_contexted_kv_attention(
device = xm.xla_device() device = xm.xla_device()
compiler_flags = [ compiler_flags_str = " ".join([
"-O1", "-O1",
"--retry_failed_compilation", "--retry_failed_compilation",
] ])
compiler_flags_str = " ".join(compiler_flags) with monkeypatch.context() as m:
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(0) torch.manual_seed(0)
torch.set_printoptions(sci_mode=False) torch.set_printoptions(sci_mode=False)
torch.set_default_device("cpu") torch.set_default_device("cpu")
dtype = torch.float32 dtype = torch.float32
min_ctx_len = 32 min_ctx_len = 32
max_ctx_len = 1024 max_ctx_len = 1024
min_query_len = 16 min_query_len = 16
max_query_len = 512 max_query_len = 512
num_kv_heads = num_heads // num_queries_per_kv num_kv_heads = num_heads // num_queries_per_kv
( (
query, query,
k_active, k_active,
v_active, v_active,
k_cache, k_cache,
v_cache, v_cache,
block_table, block_table,
key, key,
value, value,
query_lens, query_lens,
seq_lens, seq_lens,
) = sample_inputs( ) = sample_inputs(
prefill_batch_size=prefill_batch_size, prefill_batch_size=prefill_batch_size,
decode_batch_size=decode_batch_size, decode_batch_size=decode_batch_size,
min_query_len=min_query_len, min_query_len=min_query_len,
max_query_len=max_query_len, max_query_len=max_query_len,
min_ctx_len=min_ctx_len, min_ctx_len=min_ctx_len,
max_ctx_len=max_ctx_len, max_ctx_len=max_ctx_len,
block_size=block_size, block_size=block_size,
num_heads=num_heads, num_heads=num_heads,
num_kv_heads=num_kv_heads, num_kv_heads=num_kv_heads,
head_size=head_size, head_size=head_size,
dtype=dtype, dtype=dtype,
) )
output_ref = ref_context_attention( output_ref = ref_context_attention(
query, query,
key, key,
value, value,
query_lens, query_lens,
seq_lens, seq_lens,
head_size, head_size,
num_queries_per_kv, num_queries_per_kv,
return_max_reduce=False, return_max_reduce=False,
) )
# build neuron program # build neuron program
B_P_SIZE = 128 B_P_SIZE = 128
assert (large_tile_size >= B_P_SIZE assert (large_tile_size >= B_P_SIZE
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
def ceil_div(a, b): def ceil_div(a, b):
return (a + b - 1) // b return (a + b - 1) // b
def pad_to_multiple(a, b): def pad_to_multiple(a, b):
return ceil_div(a, b) * b return ceil_div(a, b) * b
def pad_to_next_power_of_2(a): def pad_to_next_power_of_2(a):
assert a > 0 assert a > 0
return 2**int(a - 1).bit_length() return 2**int(a - 1).bit_length()
# calculate input shapes # calculate input shapes
max_num_queries = pad_to_next_power_of_2(sum(query_lens)) max_num_queries = pad_to_next_power_of_2(sum(query_lens))
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
num_active_blocks = ceil_div(context_lens, block_size).sum().item() num_active_blocks = ceil_div(context_lens, block_size).sum().item()
num_active_blocks = pad_to_multiple(num_active_blocks, num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size) large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size context_kv_len = num_active_blocks * block_size
assert (context_kv_len % assert (
context_kv_len %
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
# pad QKV tensors # pad QKV tensors
pad_dims = ( pad_dims = (
0,
0,
0,
0,
0,
max_num_queries - query.shape[0],
)
query = F.pad(query, pad_dims, "constant", 0)
k = F.pad(k_active, pad_dims, "constant", 0)
v = F.pad(v_active, pad_dims, "constant", 0)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
# transform block table
active_block_table = get_active_block_tables(
block_table.cpu(),
torch.tensor(query_lens).cpu(),
torch.tensor(seq_lens).cpu(),
block_size,
num_active_blocks,
)
# Build attention masks
prior_mask, active_mask = (
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
query_lens, seq_lens, block_size=block_size))
prior_mask_padded = F.pad(
prior_mask,
(
0, 0,
context_kv_len - prior_mask.shape[1],
0, 0,
max_num_queries - prior_mask.shape[0],
),
"constant",
0,
).bool()
active_mask_padded = F.pad(
active_mask,
(
0, 0,
max_num_queries - active_mask.shape[1],
0, 0,
max_num_queries - active_mask.shape[0], 0,
), max_num_queries - query.shape[0],
"constant", )
0, query = F.pad(query, pad_dims, "constant", 0)
).bool() k = F.pad(k_active, pad_dims, "constant", 0)
attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) v = F.pad(v_active, pad_dims, "constant", 0)
attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) # permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
input_args = ( # transform block table
query.to(device=device), active_block_table = get_active_block_tables(
k.to(device=device), block_table.cpu(),
v.to(device=device), torch.tensor(query_lens).cpu(),
k_cache.to(device=device), torch.tensor(seq_lens).cpu(),
v_cache.to(device=device), block_size,
active_block_table.to(device=device), num_active_blocks,
attn_mask.to(device=device), )
)
input_kwargs = dict(
n_kv_head=num_kv_heads,
head_size=head_size,
mixed_precision=mixed_precision,
LARGE_TILE_SZ=large_tile_size,
)
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) # Build attention masks
prior_mask, active_mask = (
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
query_lens, seq_lens, block_size=block_size))
prior_mask_padded = F.pad(
prior_mask,
(
0,
context_kv_len - prior_mask.shape[1],
0,
max_num_queries - prior_mask.shape[0],
),
"constant",
0,
).bool()
active_mask_padded = F.pad(
active_mask,
(
0,
max_num_queries - active_mask.shape[1],
0,
max_num_queries - active_mask.shape[0],
),
"constant",
0,
).bool()
attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
dim=1)
num_actual_tokens = sum(query_lens) attn_mask = reorder_context_mask(attn_mask, large_tile_size,
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) block_size)
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
output_nki = output_nki[0, :num_actual_tokens, :, :]
output_ref_padded = F.pad(
output_ref,
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
"constant",
0,
)
output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :]
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) input_args = (
query.to(device=device),
k.to(device=device),
v.to(device=device),
k_cache.to(device=device),
v_cache.to(device=device),
active_block_table.to(device=device),
attn_mask.to(device=device),
)
input_kwargs = dict(
n_kv_head=num_kv_heads,
head_size=head_size,
mixed_precision=mixed_precision,
LARGE_TILE_SZ=large_tile_size,
)
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
num_actual_tokens = sum(query_lens)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
output_nki = output_nki[0, :num_actual_tokens, :, :]
output_ref_padded = F.pad(
output_ref,
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
"constant",
0,
)
output_ref = output_ref_padded.transpose(
0, 1)[0, :num_actual_tokens, :, :]
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
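The shape computation in this test leans on three small helpers (ceil_div, pad_to_multiple, pad_to_next_power_of_2). A self-contained sketch with illustrative numbers, only to make the padding arithmetic concrete; the lengths below are not taken from the test:

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b


def pad_to_multiple(a: int, b: int) -> int:
    return ceil_div(a, b) * b


def pad_to_next_power_of_2(a: int) -> int:
    assert a > 0
    return 2 ** int(a - 1).bit_length()


# Example numbers (not from the test): 3 sequences of query/context lengths.
query_lens = [17, 33, 5]
seq_lens = [100, 200, 50]
block_size = 32
large_tile_size = 2048

max_num_queries = pad_to_next_power_of_2(sum(query_lens))               # 55 -> 64
context_lens = [s - q for s, q in zip(seq_lens, query_lens)]            # [83, 167, 45]
num_active_blocks = sum(ceil_div(c, block_size) for c in context_lens)  # 3 + 6 + 2 = 11
num_active_blocks = pad_to_multiple(num_active_blocks,
                                    large_tile_size // block_size)      # pad 11 -> 64
context_kv_len = num_active_blocks * block_size                         # 2048
assert context_kv_len % large_tile_size == 0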

View File

@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
import torch import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import get_attn_backend from vllm.attention.selector import get_attn_backend
from vllm.utils import STR_INVALID_VAL from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
def test_platform_plugins(): def test_platform_plugins():
@ -25,8 +25,9 @@ def test_platform_plugins():
f" is loaded. The first import:\n{_init_trace}") f" is loaded. The first import:\n{_init_trace}")
def test_oot_attention_backend(monkeypatch): def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set # ignore the backend env variable if it is set
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with monkeypatch.context() as m:
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
assert backend.get_name() == "Dummy_Backend" backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "Dummy_Backend"

View File

@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler):
raise Exception("Exception raised by DummyV1Scheduler") raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch): def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
with pytest.raises(Exception) as exception_info: m.setenv("VLLM_USE_V1", "0")
with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs( engine_args = EngineArgs(
model="facebook/opt-125m", model="facebook/opt-125m",
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
scheduler_cls=DummyV0Scheduler, scheduler_cls=DummyV0Scheduler,
) )
engine = LLMEngine.from_engine_args(engine_args=engine_args) engine = LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV0Scheduler"
def test_scheduler_plugins_v1(monkeypatch): def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1") with monkeypatch.context() as m:
# Explicitly turn off engine multiprocessing so that the scheduler runs in m.setenv("VLLM_USE_V1", "1")
# this process # Explicitly turn off engine multiprocessing so
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") # that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with pytest.raises(Exception) as exception_info: with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs( engine_args = EngineArgs(
model="facebook/opt-125m", model="facebook/opt-125m",
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
scheduler_cls=DummyV1Scheduler, scheduler_cls=DummyV1Scheduler,
) )
engine = V1LLMEngine.from_engine_args(engine_args=engine_args) engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params) engine.add_request("0", "foo", sampling_params)
engine.step() engine.step()
assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" assert str(
exception_info.value) == "Exception raised by DummyV1Scheduler"
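Both plugin tests rely on the same hook: EngineArgs takes a scheduler_cls, and the dummy scheduler raises from an overridden method so the test can prove the custom class was actually used. The hunk above does not show which method is overridden, so the sketch below assumes it is schedule():

# Sketch only: overriding `schedule` is an assumption; the diff does not
# show which method the dummy schedulers actually override.
from vllm.core.scheduler import Scheduler


class DummyV0SchedulerSketch(Scheduler):

    def schedule(self):
        raise Exception("Exception raised by DummyV0Scheduler")

It is then wired in exactly as in the test, via EngineArgs(model=..., scheduler_cls=DummyV0SchedulerSketch).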

View File

@ -4,25 +4,29 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`. Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
from __future__ import annotations
import pytest import pytest
from tests.conftest import VllmRunner from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt from tests.core.utils import SchedulerProxy, create_dummy_prompt
from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
This module relies on V0 internals, so set VLLM_USE_V1=0. This module relies on V0 internals, so set VLLM_USE_V1=0.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
MODELS = [ MODELS = [
@ -56,7 +60,7 @@ def test_mixed_requests(
cached_position: int, cached_position: int,
enable_chunked_prefill: bool, enable_chunked_prefill: bool,
block_size: int, block_size: int,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Test the case when some sequences have the prefix cache hit Test the case when some sequences have the prefix cache hit
@ -67,72 +71,77 @@ def test_mixed_requests(
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
cached_prompt = example_prompts[cached_position] cached_prompt = example_prompts[cached_position]
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
enable_prefix_caching=True, enable_prefix_caching=True,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
block_size=block_size, block_size=block_size,
) as vllm_model: ) as vllm_model:
# Run the first prompt so the cache is populated # Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) vllm_outputs = vllm_model.generate_greedy([cached_prompt],
max_tokens)
# Run all the prompts # Run all the prompts
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0,
req_outputs = vllm_model.model.generate(example_prompts, greedy_params) max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
# Verify number of cached tokens # Verify number of cached tokens
for i in range(len(req_outputs)): for i in range(len(req_outputs)):
if i == cached_position: if i == cached_position:
expected_num_cached_tokens = ( expected_num_cached_tokens = (
len(req_outputs[i].prompt_token_ids) // len(req_outputs[i].prompt_token_ids) //
block_size) * block_size block_size) * block_size
else: else:
expected_num_cached_tokens = 0 expected_num_cached_tokens = 0
assert ( assert (req_outputs[i].num_cached_tokens ==
req_outputs[i].num_cached_tokens == expected_num_cached_tokens) expected_num_cached_tokens)
vllm_outputs = [( vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids), output.prompt_token_ids + list(output.outputs[0].token_ids),
output.prompt + output.outputs[0].text, output.prompt + output.outputs[0].text,
) for output in req_outputs] ) for output in req_outputs]
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
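The expected cached-token count above is simply the prompt length rounded down to a whole number of KV-cache blocks; a quick numeric sketch with an arbitrary prompt length:

block_size = 16
prompt_len = 53  # hypothetical len(prompt_token_ids)
expected_num_cached_tokens = (prompt_len // block_size) * block_size
assert expected_num_cached_tokens == 48  # only complete blocks count as cached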
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_unstable_prompt_sequence( def test_unstable_prompt_sequence(
vllm_runner, vllm_runner,
backend: str, backend: str,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm(): if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm(): if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.") pytest.skip("Xformers does not support ROCm/HIP.")
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner( with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-0.5B-Instruct",
enable_chunked_prefill=True, enable_chunked_prefill=True,
enable_prefix_caching=True, enable_prefix_caching=True,
max_model_len=4096, max_model_len=4096,
) as vllm_model: ) as vllm_model:
for prompt in UNSTABLE_PROMPT_SEQUENCE: for prompt in UNSTABLE_PROMPT_SEQUENCE:
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
SamplingParams(max_tokens=1)) SamplingParams(max_tokens=1))
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)

View File

@ -10,7 +10,8 @@ import pytest
import torch import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from tests.utils import compare_two_settings, fork_new_process_for_each_test
from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test = [ models_4bit_to_test = [
("facebook/opt-125m", "quantize opt model inflight"), ("facebook/opt-125m", "quantize opt model inflight"),
@ -32,7 +33,7 @@ models_pre_quant_8bit_to_test = [
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test) @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None: model_name, description) -> None:
@ -45,7 +46,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", @pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test) models_pre_qaunt_4bit_to_test)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None: model_name, description) -> None:
@ -57,7 +58,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", @pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test) models_pre_quant_8bit_to_test)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None: model_name, description) -> None:
@ -70,7 +71,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test) @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None: model_name, description) -> None:
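Each of these tests now takes @create_new_process_for_each_test(), a decorator factory (note the call parentheses) replacing the bare @fork_new_process_for_each_test decorator. vLLM's actual helper is not shown in this diff; the following is only a generic sketch of the decorator-factory shape, isolating a test in a fresh forked process:

# Illustration only; not vLLM's implementation of create_new_process_for_each_test.
import functools
import multiprocessing


def run_in_new_process(method: str = "fork"):
    # "fork" keeps the wrapped function reachable without pickling;
    # "spawn" would additionally require a picklable, module-level target.

    def decorator(fn):

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            ctx = multiprocessing.get_context(method)
            proc = ctx.Process(target=fn, args=args, kwargs=kwargs)
            proc.start()
            proc.join()
            assert proc.exitcode == 0, f"{fn.__name__} failed in the child process"

        return wrapper

    return decorator


@run_in_new_process()
def test_runs_isolated():
    assert 1 + 1 == 2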
@ -88,7 +89,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test) @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_load_pp_4bit_bnb_model(model_name, description) -> None: def test_load_pp_4bit_bnb_model(model_name, description) -> None:
common_args = [ common_args = [
"--disable-log-stats", "--disable-log-stats",

View File

@ -42,7 +42,7 @@ from transformers import AutoTokenizer
from vllm import SamplingParams from vllm import SamplingParams
from ...utils import fork_new_process_for_each_test from ...utils import create_new_process_for_each_test
from .conftest import (get_output_from_llm_generator, from .conftest import (get_output_from_llm_generator,
run_equality_correctness_test) run_equality_correctness_test)
@ -82,7 +82,7 @@ from .conftest import (get_output_from_llm_generator,
@pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{}])
@pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_with_detokenization(test_llm_generator, def test_spec_decode_e2e_with_detokenization(test_llm_generator,
batch_size: int): batch_size: int):
"""Run generation with speculative decoding on a batch. Verify the engine """Run generation with speculative decoding on a batch. Verify the engine
@ -170,7 +170,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
]) ])
@pytest.mark.parametrize("batch_size", [1]) @pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
@ -244,7 +244,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
]) ])
@pytest.mark.parametrize("batch_size", [64]) @pytest.mark.parametrize("batch_size", [64])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
@ -300,7 +300,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
]) ])
@pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize("batch_size", [32])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
@ -356,7 +356,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
256, 256,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_greedy_correctness_real_model_bs1( def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
@ -411,7 +411,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
64, 64,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
@ -469,7 +469,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
]) ])
@pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_e2e_greedy_correctness_with_preemption( def test_spec_decode_e2e_greedy_correctness_with_preemption(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
@ -534,7 +534,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,
@ -594,7 +594,7 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
64, 64,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_skip_speculation(vllm_runner, common_llm_kwargs, def test_skip_speculation(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int, test_llm_kwargs, batch_size: int, output_len: int,
@ -644,7 +644,7 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
@pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [10]) @pytest.mark.parametrize("output_len", [10])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_disable_speculation(vllm_runner, common_llm_kwargs, def test_disable_speculation(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int, test_llm_kwargs, batch_size: int, output_len: int,
@ -697,7 +697,7 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
output_len: int, seed: int): output_len: int, seed: int):
@ -752,7 +752,7 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
32, 32,
]) ])
@pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("seed", [1])
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs, def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs,

View File

@ -56,12 +56,11 @@ def test_gc():
assert allocated < 50 * 1024 * 1024 assert allocated < 50 * 1024 * 1024
def test_model_from_modelscope(monkeypatch): def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" with monkeypatch.context() as m:
monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_USE_MODELSCOPE", "True")
try: llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
llm = LLM(model=MODELSCOPE_MODEL_NAME)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
assert len(outputs) == 4 assert len(outputs) == 4
finally:
monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
if __name__ == "__main__":
import pytest
pytest.main([__file__])

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import asyncio import asyncio
import os
import socket import socket
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from unittest.mock import patch from unittest.mock import patch
@ -16,7 +16,7 @@ from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
deprecate_kwargs, get_open_port, memory_profiling, deprecate_kwargs, get_open_port, memory_profiling,
merge_async_iterators, supports_kw, swap_dict_values) merge_async_iterators, supports_kw, swap_dict_values)
from .utils import error_on_warning, fork_new_process_for_each_test from .utils import create_new_process_for_each_test, error_on_warning
@pytest.mark.asyncio @pytest.mark.asyncio
@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message():
dummy(old_arg=1) dummy(old_arg=1)
def test_get_open_port(): def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
os.environ["VLLM_PORT"] = "5678" with monkeypatch.context() as m:
# make sure we can get multiple ports, even if the env var is set m.setenv("VLLM_PORT", "5678")
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: # make sure we can get multiple ports, even if the env var is set
s1.bind(("localhost", get_open_port())) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: s1.bind(("localhost", get_open_port()))
s2.bind(("localhost", get_open_port())) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: s2.bind(("localhost", get_open_port()))
s3.bind(("localhost", get_open_port())) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
os.environ.pop("VLLM_PORT") s3.bind(("localhost", get_open_port()))
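get_open_port is expected to keep handing out bindable ports even while VLLM_PORT is set and earlier ports stay bound. A generic sketch of how such a helper can behave (an env-var override that walks forward past busy ports, otherwise asking the OS for a free one); vLLM's real implementation may differ in details:

import os
import socket


def get_open_port_sketch(env_var: str = "VLLM_PORT") -> int:
    override = os.environ.get(env_var)
    if override is not None:
        candidate = int(override)
        while True:
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("localhost", candidate))
                return candidate
            except OSError:
                candidate += 1  # port busy, try the next one
    # No override: bind to port 0 and let the OS choose.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]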
# Tests for FlexibleArgumentParser # Tests for FlexibleArgumentParser
@ -276,7 +276,7 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
) == is_supported ) == is_supported
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_memory_profiling(): def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling # Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch # Memory used by other processes will show up as cuda usage outside of torch
@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_encoder_decoder(monkeypatch): def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet. # V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
monkeypatch.setenv("VLLM_USE_V1", "0") with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType
# example from bart # example from bart
ctx = { ctx = {
'encoder.layers.0.self_attn.attn': 'encoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
'decoder.layers.0.encoder_attn.attn': 'decoder.layers.0.encoder_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
'decoder.layers.0.self_attn.attn': 'decoder.layers.0.self_attn.attn':
Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
} }
kv_cache = [ kv_cache = [
torch.zeros((1, )), torch.zeros((1, )),
] ]
encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
bind_kv_cache(ctx, [kv_cache]) bind_kv_cache(ctx, [kv_cache])
assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_pp(): def test_bind_kv_cache_pp():

View File

@ -46,6 +46,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model": "model":
"NousResearch/Hermes-3-Llama-3.1-8B", "NousResearch/Hermes-3-Llama-3.1-8B",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "hermes", "--chat-template", "--tool-call-parser", "hermes", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
], ],
@ -60,6 +61,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model": "model":
"meta-llama/Meta-Llama-3.1-8B-Instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template", "--tool-call-parser", "llama3_json", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja") str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja")
], ],
@ -70,6 +72,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model": "model":
"meta-llama/Llama-3.2-3B-Instruct", "meta-llama/Llama-3.2-3B-Instruct",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template", "--tool-call-parser", "llama3_json", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja") str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja")
], ],
@ -80,6 +83,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model": "model":
"mistralai/Mistral-7B-Instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.3",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "mistral", "--chat-template", "--tool-call-parser", "mistral", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
"--ignore-patterns=\"consolidated.safetensors\"" "--ignore-patterns=\"consolidated.safetensors\""
@ -111,22 +115,28 @@ CONFIGS: dict[str, ServerConfig] = {
"model": "model":
"ibm-granite/granite-3.0-8b-instruct", "ibm-granite/granite-3.0-8b-instruct",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "granite", "--chat-template", "--tool-call-parser", "granite", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_granite.jinja") str(VLLM_PATH / "examples/tool_chat_template_granite.jinja")
], ],
}, },
"granite-3.1-8b": { "granite-3.1-8b": {
"model": "ibm-granite/granite-3.1-8b-instruct", "model":
"ibm-granite/granite-3.1-8b-instruct",
"arguments": [ "arguments": [
"--enforce-eager",
"--no-enable-prefix-caching",
"--tool-call-parser", "--tool-call-parser",
"granite", "granite",
], ],
"supports_parallel": True, "supports_parallel":
True,
}, },
"internlm": { "internlm": {
"model": "model":
"internlm/internlm2_5-7b-chat", "internlm/internlm2_5-7b-chat",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "internlm", "--chat-template", "--tool-call-parser", "internlm", "--chat-template",
str(VLLM_PATH / str(VLLM_PATH /
"examples/tool_chat_template_internlm2_tool.jinja"), "examples/tool_chat_template_internlm2_tool.jinja"),
@ -139,6 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model": "model":
"Team-ACE/ToolACE-8B", "Team-ACE/ToolACE-8B",
"arguments": [ "arguments": [
"--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template", "--tool-call-parser", "pythonic", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja") str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja")
], ],

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import pytest
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
@ -9,16 +9,17 @@ from ..utils import compare_two_settings
# --enforce-eager on TPU causes graph compilation # --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine, # this times out default Health Check in the MQLLMEngine,
# so we set the timeout here to 30s # so we set the timeout here to 30s
os.environ["VLLM_RPC_TIMEOUT"] = "30000"
def test_custom_dispatcher(): def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
compare_two_settings( with monkeypatch.context() as m:
"google/gemma-2b", m.setenv("VLLM_RPC_TIMEOUT", "30000")
arg1=[ compare_two_settings(
"--enforce-eager", "google/gemma-2b",
f"-O{CompilationLevel.DYNAMO_ONCE}", arg1=[
], "--enforce-eager",
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], f"-O{CompilationLevel.DYNAMO_ONCE}",
env1={}, ],
env2={}) arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
env1={},
env2={})

View File

@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# type: ignore
from __future__ import annotations
import os
import threading import threading
from collections.abc import Iterable from collections.abc import Iterable
from concurrent import futures from concurrent import futures
from typing import Callable, Literal from typing import Callable, Generator, Literal
import grpc import grpc
import pytest import pytest
@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch): def use_v0_only(monkeypatch: pytest.MonkeyPatch):
""" """
Since this module is V0 only, set VLLM_USE_V1=0 for Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module. all tests in the module.
""" """
monkeypatch.setenv('VLLM_USE_V1', '0') with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):
@pytest.fixture @pytest.fixture
def trace_service(): def trace_service() -> Generator[FakeTraceService, None, None]:
"""Fixture to set up a fake gRPC trace service""" """Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService() service = FakeTraceService()
@ -80,136 +84,153 @@ def trace_service():
server.stop(None) server.stop(None)
def test_traces(trace_service): def test_traces(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
top_p=0.1, temperature=0.01,
max_tokens=256) top_p=0.1,
model = "facebook/opt-125m" max_tokens=256,
llm = LLM( )
model=model, model = "facebook/opt-125m"
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, llm = LLM(
) model=model,
prompts = ["This is a short prompt"] otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
outputs = llm.generate(prompts, sampling_params=sampling_params) )
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
timeout = 5 timeout = 5
if not trace_service.evt.wait(timeout): if not trace_service.evt.wait(timeout):
raise TimeoutError( raise TimeoutError(
f"The fake trace service didn't receive a trace within " f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout") f"the {timeout} seconds timeout")
request = trace_service.request request = trace_service.request
assert len(request.resource_spans) == 1, ( assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, " f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}") f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, ( assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, " f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}") f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, " f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
attributes = decode_attributes( attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes) request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens ) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
outputs[0].prompt_token_ids) assert attributes.get(
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
assert attributes.get( outputs[0].prompt_token_ids)
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
metrics = outputs[0].metrics assert attributes.get(
assert attributes.get( SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue metrics = outputs[0].metrics
ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
assert attributes.get( ) == metrics.time_in_queue
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft ttft = metrics.first_token_time - metrics.arrival_time
e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
assert metrics.scheduler_time > 0 e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
) == metrics.scheduler_time assert metrics.scheduler_time > 0
# Model forward and model execute should be none, since detailed tracing is assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
# not enabled. ) == metrics.scheduler_time
assert metrics.model_forward_time is None # Model forward and model execute should be none, since detailed tracing is
assert metrics.model_execute_time is None # not enabled.
assert metrics.model_forward_time is None
assert metrics.model_execute_time is None
def test_traces_with_detailed_steps(trace_service): def test_traces_with_detailed_steps(
os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" monkeypatch: pytest.MonkeyPatch,
trace_service: FakeTraceService,
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(
top_p=0.1, temperature=0.01,
max_tokens=256) top_p=0.1,
model = "facebook/opt-125m" max_tokens=256,
llm = LLM( )
model=model, model = "facebook/opt-125m"
otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, llm = LLM(
collect_detailed_traces="all", model=model,
) otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
prompts = ["This is a short prompt"] collect_detailed_traces="all",
outputs = llm.generate(prompts, sampling_params=sampling_params) )
prompts = ["This is a short prompt"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
timeout = 5 timeout = 5
if not trace_service.evt.wait(timeout): if not trace_service.evt.wait(timeout):
raise TimeoutError( raise TimeoutError(
f"The fake trace service didn't receive a trace within " f"The fake trace service didn't receive a trace within "
f"the {timeout} seconds timeout") f"the {timeout} seconds timeout")
request = trace_service.request request = trace_service.request
assert len(request.resource_spans) == 1, ( assert len(request.resource_spans) == 1, (
f"Expected 1 resource span, " f"Expected 1 resource span, "
f"but got {len(request.resource_spans)}") f"but got {len(request.resource_spans)}")
assert len(request.resource_spans[0].scope_spans) == 1, ( assert len(request.resource_spans[0].scope_spans) == 1, (
f"Expected 1 scope span, " f"Expected 1 scope span, "
f"but got {len(request.resource_spans[0].scope_spans)}") f"but got {len(request.resource_spans[0].scope_spans)}")
assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
f"Expected 1 span, " f"Expected 1 span, "
f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
attributes = decode_attributes( attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes) request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature ) == sampling_params.temperature
assert attributes.get( assert attributes.get(
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get( assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens ) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
outputs[0].prompt_token_ids) assert attributes.get(
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
assert attributes.get( outputs[0].prompt_token_ids)
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
metrics = outputs[0].metrics assert attributes.get(
assert attributes.get( SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue metrics = outputs[0].metrics
ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
assert attributes.get( ) == metrics.time_in_queue
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft ttft = metrics.first_token_time - metrics.arrival_time
e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
assert metrics.scheduler_time > 0 e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
) == metrics.scheduler_time assert metrics.scheduler_time > 0
assert metrics.model_forward_time > 0 assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
assert attributes.get( ) == metrics.scheduler_time
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( assert metrics.model_forward_time > 0
metrics.model_forward_time / 1000) assert attributes.get(
assert metrics.model_execute_time > 0 SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE ) == pytest.approx(metrics.model_forward_time / 1000)
) == metrics.model_execute_time assert metrics.model_execute_time > 0
assert metrics.model_forward_time < 1000 * metrics.model_execute_time assert attributes.get(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
) == metrics.model_execute_time
assert metrics.model_forward_time < 1000 * metrics.model_execute_time
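Note on units in the assertions above: the span attribute GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD is compared against metrics.model_forward_time / 1000, which suggests model_forward_time is recorded in milliseconds while model_execute_time and the exported span values are in seconds; the final inequality therefore compares the two on a millisecond scale.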
@ -7,12 +7,14 @@ import os
import signal import signal
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager, suppress
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Optional, Union from typing import Any, Callable, Literal, Optional, Union
import cloudpickle
import openai import openai
import pytest import pytest
import requests import requests
@ -566,6 +568,7 @@ def init_test_distributed_environment(
def multi_process_parallel( def multi_process_parallel(
monkeypatch: pytest.MonkeyPatch,
tp_size: int, tp_size: int,
pp_size: int, pp_size: int,
test_target: Any, test_target: Any,
@ -582,7 +585,13 @@ def multi_process_parallel(
refs = [] refs = []
for rank in range(tp_size * pp_size): for rank in range(tp_size * pp_size):
refs.append( refs.append(
test_target.remote(tp_size, pp_size, rank, distributed_init_port)) test_target.remote(
monkeypatch,
tp_size,
pp_size,
rank,
distributed_init_port,
), )
ray.get(refs) ray.get(refs)
ray.shutdown() ray.shutdown()
@ -696,11 +705,83 @@ def fork_new_process_for_each_test(
return wrapper return wrapper
def spawn_new_process_for_each_test(
f: Callable[_P, None]) -> Callable[_P, None]:
"""Decorator to spawn a new process for each test function.
"""
@functools.wraps(f)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
# Check if we're already in a subprocess
if os.environ.get('RUNNING_IN_SUBPROCESS') == '1':
# If we are, just run the function directly
return f(*args, **kwargs)
import torch.multiprocessing as mp
with suppress(RuntimeError):
mp.set_start_method('spawn')
# Get the module
module_name = f.__module__
# Create a process with environment variable set
env = os.environ.copy()
env['RUNNING_IN_SUBPROCESS'] = '1'
with tempfile.TemporaryDirectory() as tempdir:
output_filepath = os.path.join(tempdir, "new_process.tmp")
# `cloudpickle` allows pickling complex functions directly
input_bytes = cloudpickle.dumps((f, output_filepath))
cmd = [sys.executable, "-m", f"{module_name}"]
returned = subprocess.run(cmd,
input=input_bytes,
capture_output=True,
env=env)
# check if the subprocess is successful
try:
returned.check_returncode()
except Exception as e:
# wrap raised exception to provide more information
raise RuntimeError(f"Error raised in subprocess:\n"
f"{returned.stderr.decode()}") from e
return wrapper
def create_new_process_for_each_test(
method: Optional[Literal["spawn", "fork"]] = None
) -> Callable[[Callable[_P, None]], Callable[_P, None]]:
"""Creates a decorator that runs each test function in a new process.
Args:
method: The process creation method. Can be either "spawn" or "fork".
If not specified,
it defaults to "spawn" on ROCm platforms and "fork" otherwise.
Returns:
A decorator to run test functions in separate processes.
"""
if method is None:
method = "spawn" if current_platform.is_rocm() else "fork"
assert method in ["spawn",
"fork"], "Method must be either 'spawn' or 'fork'"
if method == "fork":
return fork_new_process_for_each_test
return spawn_new_process_for_each_test
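As used further down in this diff (e.g. on test_engine_core), the factory is invoked to produce the decorator. With "fork" the child inherits the parent interpreter state; with "spawn" the test module is re-run via `python -m <module>` and the cloudpickled test function is piped to it over stdin, so the module is expected to provide a matching __main__ entry point that unpickles and runs it (that counterpart is not shown in this hunk). A minimal usage sketch; the test names and bodies are illustrative only:

@create_new_process_for_each_test()  # "spawn" on ROCm, "fork" elsewhere
def test_runs_in_a_fresh_process(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        ...  # state created here is torn down with the child process

@create_new_process_for_each_test("spawn")  # force spawn regardless of platform
def test_forced_spawn():
    ...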
def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
""" """
Get a pytest mark, which skips the test if the GPU doesn't meet Get a pytest mark, which skips the test if the GPU doesn't meet
a minimum memory requirement in GB. a minimum memory requirement in GB.
This can be leveraged via `@large_gpu_test` to skip tests in environments This can be leveraged via `@large_gpu_test` to skip tests in environments
without enough resources, or called when filtering tests to run directly. without enough resources, or called when filtering tests to run directly.
""" """
@ -755,7 +836,7 @@ def multi_gpu_test(*, num_gpus: int):
marks = multi_gpu_marks(num_gpus=num_gpus) marks = multi_gpu_marks(num_gpus=num_gpus)
def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
func = fork_new_process_for_each_test(f) func = create_new_process_for_each_test()(f)
for mark in reversed(marks): for mark in reversed(marks):
func = mark(func) func = mark(func)
@ -1,5 +1,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import random import random
from typing import Any
import pytest import pytest
@ -50,8 +53,12 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct" return "meta-llama/Meta-Llama-3-8B-Instruct"
def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, def test_ngram_correctness(
model_name): monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
):
''' '''
Compare the outputs of an original LLM and a speculative LLM; Compare the outputs of an original LLM and a speculative LLM;
they should be the same when using ngram speculative decoding. they should be the same when using ngram speculative decoding.
@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load(monkeypatch, output_kind: RequestOutputKind, async def test_load(
engine_args_and_prompt: tuple[AsyncEngineArgs, monkeypatch: pytest.MonkeyPatch,
PromptType]): output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the # so that in the future when we switch, we don't have to change all the
# tests. # tests.
@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT), [(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)]) (VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_abort(monkeypatch, output_kind: RequestOutputKind, async def test_abort(monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs, engine_args_and_prompt: tuple[AsyncEngineArgs,
PromptType]): PromptType]):
@ -9,7 +9,6 @@ from concurrent.futures import Future
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tests.utils import fork_new_process_for_each_test
from vllm import SamplingParams from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -19,6 +18,8 @@ from vllm.v1.executor.abstract import Executor, UniProcExecutor
from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
from ...utils import create_new_process_for_each_test
if not current_platform.is_cuda(): if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.", pytest.skip(reason="V1 currently only supported on CUDA.",
allow_module_level=True) allow_module_level=True)
@ -44,8 +45,8 @@ def make_request() -> EngineCoreRequest:
) )
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_engine_core(monkeypatch): def test_engine_core(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -158,11 +159,11 @@ def test_engine_core(monkeypatch):
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_engine_core_advanced_sampling(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
""" """
A basic end-to-end test to verify that the engine functions correctly A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set. presence_penalty, are set.
""" """
with monkeypatch.context() as m: with monkeypatch.context() as m:
@ -208,8 +209,8 @@ def test_engine_core_advanced_sampling(monkeypatch):
_check_engine_state() _check_engine_state()
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_engine_core_concurrent_batches(monkeypatch): def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """
@ -8,7 +8,6 @@ from typing import Optional
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tests.utils import fork_new_process_for_each_test
from vllm import SamplingParams from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -19,6 +18,8 @@ from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
SyncMPClient) SyncMPClient)
from vllm.v1.executor.abstract import Executor from vllm.v1.executor.abstract import Executor
from ...utils import create_new_process_for_each_test
if not current_platform.is_cuda(): if not current_platform.is_cuda():
pytest.skip(reason="V1 currently only supported on CUDA.", pytest.skip(reason="V1 currently only supported on CUDA.",
allow_module_level=True) allow_module_level=True)
@ -88,9 +89,10 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
return msg return msg
@fork_new_process_for_each_test @create_new_process_for_each_test()
@pytest.mark.parametrize("multiprocessing_mode", [True, False]) @pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -175,7 +177,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
@ -18,6 +18,9 @@ MODELS_TO_TEST = [
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410" "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
] ]
# Undo after https://github.com/vllm-project/vllm/pull/14868
pytest.skip(allow_module_level=True)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
@ -57,7 +57,7 @@ def _repeat_logprob_config(
logprob_prompt_logprob_list: BatchLogprobsSpecType, logprob_prompt_logprob_list: BatchLogprobsSpecType,
) -> BatchLogprobsSpecType: ) -> BatchLogprobsSpecType:
"""Ensure each test prompt has a logprob config. """Ensure each test prompt has a logprob config.
A logprob config specifies the optional (i.e. A logprob config specifies the optional (i.e.
may-be-`None`) number of sample logprobs and may-be-`None`) number of sample logprobs and
the optional number of prompt logprobs. the optional number of prompt logprobs.
@ -80,7 +80,7 @@ def _repeat_logprob_config(
(optional num sample logprob, (optional num sample logprob,
optional num prompt logprob) optional num prompt logprob)
tuples tuples
Returns: Returns:
list of list of
(optional num sample logprob, optional num prompt logprob) (optional num sample logprob, optional num prompt logprob)
@ -255,14 +255,12 @@ def _run_and_validate(
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
@pytest.mark.parametrize("temperature", [0.0, 2.0]) @pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs( def test_get_logprobs_and_prompt_logprobs(
hf_model, hf_model, vllm_model,
vllm_model, batch_logprobs_composition: BatchLogprobsComposition,
batch_logprobs_composition: BatchLogprobsComposition, temperature: float, example_prompts: list[str],
temperature: float, monkeypatch: pytest.MonkeyPatch) -> None:
example_prompts,
) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
Exercise a variety of combinations of `logprobs` and `prompt_logprobs` Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
settings and validate that settings and validate that
* The generated logprobs and prompt logprobs are consistent with the * The generated logprobs and prompt logprobs are consistent with the
@ -279,7 +277,7 @@ def test_get_logprobs_and_prompt_logprobs(
To save time, only test one APC-enabled scenario To save time, only test one APC-enabled scenario
(sample & prompt logprobs enabled, temperature>0.0). (sample & prompt logprobs enabled, temperature>0.0).
Args: Args:
hf_model: HuggingFace reference model fixture hf_model: HuggingFace reference model fixture
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching with monkeypatch.context() as m:
if do_apc and (temperature < 2.0 m.setenv("VLLM_USE_V1", "1")
or batch_logprobs_composition != SAMPLE_PROMPT): do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
# Skip some test-cases to save time. if do_apc and (temperature < 2.0
pytest.skip() or batch_logprobs_composition != SAMPLE_PROMPT):
test_prompts = example_prompts # Skip some test-cases to save time.
pytest.skip()
test_prompts = example_prompts
max_tokens = 5 max_tokens = 5
hf_outputs = hf_model.generate_greedy( hf_outputs = hf_model.generate_greedy(
test_prompts, test_prompts,
max_tokens=max_tokens,
)
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens, max_tokens=max_tokens,
do_apc=do_apc) )
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(
batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens,
do_apc=do_apc)
def test_max_logprobs(): def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
Args:
monkeypatch: pytest monkeypatch fixture, used here to pin VLLM_USE_V1
""" """
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner("facebook/opt-125m", runner = VllmRunner("facebook/opt-125m",
max_logprobs=1, max_logprobs=1,
enable_prefix_caching=False, enable_prefix_caching=False,
max_model_len=256) max_model_len=256)
vllm_sampling_params = SamplingParams(logprobs=1) vllm_sampling_params = SamplingParams(logprobs=1)
# should pass # should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params) runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
bad_sampling_params = SamplingParams(logprobs=2) bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params) runner.generate(["Hello world"],
sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts): def test_none_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return `logprobs` and `prompt_logprobs` as `None` """Engine should return `logprobs` and `prompt_logprobs` as `None`
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
max_tokens = 5 with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_none = SamplingParams(
logprobs=None, max_tokens=max_tokens,
prompt_logprobs=None, logprobs=None,
temperature=0.0) prompt_logprobs=None,
results_logprobs_none = vllm_model.model.generate( temperature=0.0,
example_prompts, sampling_params=sampling_params_logprobs_none) )
results_logprobs_none = vllm_model.model.generate(
example_prompts,
sampling_params=sampling_params_logprobs_none,
)
for i in range(len(results_logprobs_none)): for i in range(len(results_logprobs_none)):
# Check sample logprobs are None # Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None assert results_logprobs_none[i].outputs[
# Check prompt logprobs are None 0].cumulative_logprob is None
assert results_logprobs_none[i].prompt_logprobs is None # Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts): def test_zero_logprobs(vllm_model, example_prompts,
monkeypatch: pytest.MonkeyPatch):
"""Engine should return sampled token and prompt token logprobs """Engine should return sampled token and prompt token logprobs
Args: Args:
vllm_model: vLLM model fixture vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture) example_prompts: list of example prompts (test fixture)
""" """
max_tokens = 5 with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
logprobs=0, logprobs=0,
prompt_logprobs=0, prompt_logprobs=0,
temperature=0.0) temperature=0.0)
results_logprobs_zero = vllm_model.model.generate( results_logprobs_zero = vllm_model.model.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero) example_prompts, sampling_params=sampling_params_logprobs_zero)
for i in range(len(results_logprobs_zero)): for i in range(len(results_logprobs_zero)):
# Check that there is one sample logprob dict for each # Check that there is one sample logprob dict for each
# sample token # sample token
logprobs = results_logprobs_zero[i].outputs[0].logprobs logprobs = results_logprobs_zero[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
assert logprobs is not None assert logprobs is not None
assert len(sampled_token_ids) == len(logprobs) assert len(sampled_token_ids) == len(logprobs)
assert results_logprobs_zero[i].outputs[ assert results_logprobs_zero[i].outputs[
0].cumulative_logprob is not None 0].cumulative_logprob is not None
# Check that there is one prompt logprob dict for each # Check that there is one prompt logprob dict for each
# prompt token # prompt token
assert prompt_logprobs is not None assert prompt_logprobs is not None
assert len(prompt_token_ids) == len(prompt_logprobs) assert len(prompt_token_ids) == len(prompt_logprobs)
@ -1,37 +1,51 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Any, Optional
import pytest import pytest
import torch import torch
import torch.nn.functional as F
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
DEVICE = "cpu"
@pytest.fixture @pytest.fixture
def sampler(): def sampler():
return RejectionSampler() return RejectionSampler()
def create_logits_tensor(token_ids: list[int], def create_logits_tensor(token_ids: list[list[int]],
vocab_size: int = 100) -> torch.Tensor: vocab_size: int = 100) -> torch.Tensor:
"""Helper function to create logits tensor that """Helper function to create logits tensor that
will produce desired token ids on argmax""" will produce desired token ids on argmax"""
logits = torch.full((len(token_ids), vocab_size), -100.0).cuda() num_total_tokens = sum(len(tokens) for tokens in token_ids)
for i, token_id in enumerate(token_ids): logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
logits[i, token_id] = 100.0 start_loc = 0
for tokens in token_ids:
for j, token_id in enumerate(tokens):
logits[start_loc + j, token_id] = 100.0
start_loc += len(tokens)
return logits return logits
def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata: def create_sampling_metadata(
batch_size = len(spec_tokens) all_greedy: bool,
generators: Optional[dict[int, Any]] = None) -> SamplingMetadata:
"""Create a v1 sampling metadata object with all_greedy set
to the given value. Either all greedy or all random sampling
is used.
"""
generators = generators or {}
return SamplingMetadata( return SamplingMetadata(
temperature=torch.tensor([]), temperature=torch.tensor([]),
all_greedy=True, all_greedy=all_greedy,
all_random=False, all_random=not all_greedy,
top_p=None, top_p=None,
top_k=None, top_k=None,
min_p=torch.empty(batch_size, ), min_p=torch.empty(1, ),
generators={}, generators=generators,
max_num_logprobs=0, max_num_logprobs=0,
no_penalties=False, no_penalties=False,
prompt_token_ids=None, prompt_token_ids=None,
@ -40,129 +54,310 @@ def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata:
repetition_penalties=torch.tensor([]), repetition_penalties=torch.tensor([]),
output_token_ids=[], output_token_ids=[],
min_tokens={}, min_tokens={},
logit_bias=[None] * batch_size, logit_bias=[None],
allowed_token_ids_mask=None, allowed_token_ids_mask=None,
bad_words_token_ids={}, bad_words_token_ids={},
) )
########################### Tests for Greedy Sampling ###################
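The greedy-path tests below all invoke the sampler positionally with the signature exercised by this diff; a sketch of the call shape they assume (the annotations are interpretation, not quoted from the tests):

output = sampler(
    spec_tokens,         # list[list[int]]: draft token ids, one list per request
    None,                # draft probabilities, not needed on the greedy path
    bonus_token_tensor,  # bonus token ids, one per request
    logits,              # target logits: one row per draft token plus the bonus token
    metadata,            # SamplingMetadata with all_greedy=True
)
# output has shape [num_requests, max_spec_len + 1]; rejected positions hold INVALID_TOKEN_ID.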
def test_perfect_match(sampler): def test_perfect_match(sampler):
"""Test when output tokens perfectly match speculated tokens""" """Test when output tokens perfectly match speculated tokens"""
spec_tokens = [[1, 2, 3]] spec_tokens = [[1, 2, 3]]
output_tokens = [1, 2, 3, 4] # 4 is the bonus token output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2, 3, 4]], expected = torch.tensor([[1, 2, 3, 4]],
dtype=torch.int, dtype=torch.int,
device=logits.device) device=logits.device)
assert torch.equal(output.sampled_token_ids, expected) assert torch.equal(output, expected)
def test_early_mismatch(sampler): def test_early_mismatch(sampler):
"""Test when there's an early mismatch in tokens""" """Test when there's an early mismatch in tokens"""
spec_tokens = [[1, 2, 3]] spec_tokens = [[1, 2, 3]]
output_tokens = [1, 5, 3, 4] # Mismatch at position 1 output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]], expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
dtype=torch.int, dtype=torch.int,
device=logits.device) device=logits.device)
assert torch.equal(output.sampled_token_ids, expected) assert torch.equal(output, expected)
def test_multiple_sequences(sampler): def test_multiple_sequences(sampler):
"""Test handling multiple sequences of speculated tokens""" """Test handling multiple sequences of speculated tokens"""
spec_tokens = [[1, 2], [3]] spec_tokens = [[1, 2], [3]]
output_tokens = [1, 2, 5, 3, 4] # Two sequences with bonus tokens 5 and 4 output_tokens = [[1, 2, 5], [3,
4]] # Two sequences with bonus tokens 5 and 4
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor(
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]], expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]],
dtype=torch.int, dtype=torch.int,
device=logits.device) device=logits.device)
assert torch.equal(output.sampled_token_ids, expected) assert torch.equal(output, expected)
def test_single_token_sequence(sampler): def test_single_token_sequence(sampler):
"""Test handling sequences with single token""" """Test handling sequences with single token"""
spec_tokens = [[1]] spec_tokens = [[1]]
output_tokens = [1, 2] # Single token with bonus token 2 output_tokens = [[1, 2]] # Single token with bonus token 2
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device) expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
assert torch.equal(output.sampled_token_ids, expected) assert torch.equal(output, expected)
def test_empty_sequence(sampler): def test_empty_sequence(sampler):
"""Test handling empty sequence of speculated tokens""" """Test handling empty sequence of speculated tokens"""
spec_tokens: list[list[int]] = [[]] spec_tokens: list[list[int]] = [[]]
output_tokens = [5] # Just the bonus token output_tokens = [[5]] # Just the bonus token
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[5]], dtype=torch.int, device=logits.device) expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
assert torch.equal(output.sampled_token_ids, expected) assert torch.equal(output, expected)
def test_multiple_mismatches(sampler): def test_multiple_mismatches(sampler):
"""Test handling multiple sequences with mismatches""" """Test handling multiple sequences with mismatches"""
spec_tokens = [[1, 2, 3], [4, 5, 6]] spec_tokens = [[1, 2, 3], [4, 5, 6]]
output_tokens = [1, 2, 7, 6, 4, 8, 6, 9] # Mismatches in both sequences output_tokens = [[1, 2, 7, 6], [4, 8, 6,
9]] # Mismatches in both sequences
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor(
[output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID], expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID],
[4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]], [4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
dtype=torch.int, dtype=torch.int,
device=logits.device) device=logits.device)
assert torch.equal(output.sampled_token_ids, expected) assert torch.equal(output, expected)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"spec_tokens,output_tokens,expected", "spec_tokens,output_tokens,expected",
[ [
([[1, 2]], [1, 2, 3], [[1, 2, 3]]), # Perfect match with bonus ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus
([[1]], [2, 3], [[2, INVALID_TOKEN_ID]]), # First mismatch ([[1]], [[2, 3]], [[2, INVALID_TOKEN_ID]]), # First mismatch
([[1, 2], [3, 4]], [1, 5, 6, 3, 4, 7], [[1, 5, INVALID_TOKEN_ID], ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
[3, 4, 7]]), # Mixed matches [[1, 5, INVALID_TOKEN_ID], [3, 4, 7]]), # Mixed matches
]) ])
def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected): def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
"""Parametrized test for various matching scenarios""" """Parametrized test for various matching scenarios"""
metadata = create_sampling_metadata(spec_tokens) metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens) logits = create_logits_tensor(output_tokens)
bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens],
device=logits.device)
output = sampler(spec_tokens, logits, metadata) output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected_tensor = torch.tensor(expected, expected_tensor = torch.tensor(expected,
dtype=torch.int, dtype=torch.int,
device=logits.device) device=logits.device)
assert torch.equal(output.sampled_token_ids, expected_tensor) assert torch.equal(output, expected_tensor)
def test_logits_shape_handling(sampler): ########################### Tests for Random Sampling ###################
"""Test handling of different logits tensor shapes""" @pytest.mark.parametrize("k", [1, 3, 5])
spec_tokens = [[1, 2]] @pytest.mark.parametrize("vocab_size", [1000])
output_tokens = [1, 2, 3] @pytest.mark.parametrize("batch_size", [1, 4, 8])
vocab_size = 1000 @pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
@pytest.mark.parametrize("n_rep", [20])
def test_deterministic_when_seeded(sampler, k: int, vocab_size: int,
batch_size: int, frac_seeded: float,
n_rep: int):
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
target_probs = torch.rand(batch_size * (k + 1),
vocab_size,
dtype=torch.float32)
bonus_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, 1),
dtype=torch.int64)
draft_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64)
metadata = create_sampling_metadata(spec_tokens) seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
logits = create_logits_tensor(output_tokens, vocab_size)
output = sampler(spec_tokens, logits, metadata) results = []
expected = torch.tensor([[1, 2, 3]], dtype=torch.int, device=logits.device) for _ in range(n_rep):
assert torch.equal(output.sampled_token_ids, expected) seeded_seqs = {
assert logits.shape[-1] == vocab_size i: torch.Generator(device=DEVICE).manual_seed(i)
for i in range(batch_size) if seeded_mask[i]
}
sampling_metadata = create_sampling_metadata(all_greedy=False,
generators=seeded_seqs)
rep_result = sampler(draft_token_ids.tolist(), draft_probs,
bonus_token_ids, target_probs, sampling_metadata)
results.append(rep_result)
for i in range(batch_size):
if seeded_mask[i]:
for j in range(1, n_rep):
assert torch.equal(results[j][i], results[0][i])
def test_rejection_sampling_approximates_target_distribution():
"""Verify rejection sampling approximates target distribution,
despite sampling from a potentially distinct draft distribution.
This is done by first creating a random target probability
distribution and a random draft probability distribution. We then
sample token ids from the rejection sampler using these draft
and target distributions. The samples are used to estimate
the output probability distribution, which we expect to approximate
the target distribution.
A basic distance metric is used to determine similarity between
distributions.
We expect that as we increase the number of samples,
the distance between the observed distribution and the target
distribution decreases. To measure this, we compare the distance
of the observed distribution against both the target distribution
and a uniform random distribution. We expect the distance between
the observed distribution and the target distribution to improve
much more than the distance improvement between the observed
distribution and the random distribution.
"""
torch.set_default_device(DEVICE)
vocab_size = 10
k = 2
num_reference_probs = 100
# Prepare draft, target, and reference probability distributions
draft_probs, target_probs = (F.softmax(
torch.rand(vocab_size, dtype=torch.float32),
dim=-1,
) for _ in range(2))
reference_probs = F.softmax(
torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
dim=-1,
)
sample_sizes = [10, 100, 1_000, 10_000, 100_000]
distance_wrt_reference: list[float] = []
distance_wrt_target: list[float] = []
for num_samples in sample_sizes:
# Sample using rejection sampling.
rej_sample_probs = estimate_rejection_sampling_pdf(
draft_probs, target_probs, k, vocab_size, num_samples)
rej_sample_probs = rej_sample_probs.to(DEVICE)
# Average distance from reference probs.
reference_vs_rejsample_dist = torch.dist(
reference_probs,
rej_sample_probs).item() / reference_probs.shape[0]
target_vs_rejsample_dist = torch.dist(target_probs,
rej_sample_probs).item()
distance_wrt_reference.append(reference_vs_rejsample_dist)
distance_wrt_target.append(target_vs_rejsample_dist)
relative_change_in_distance_wrt_target = get_ratio_first_to_last(
distance_wrt_target)
relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
distance_wrt_reference)
print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
f"{reference_vs_rejsample_dist=:.05f}")
print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
f"{relative_change_in_distance_wrt_reference=:.02f}")
relative_change_in_distance_wrt_target = get_ratio_first_to_last(
distance_wrt_target)
relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
distance_wrt_reference)
expected_improvement_multiplier = 20
assert (relative_change_in_distance_wrt_target
> relative_change_in_distance_wrt_reference *
expected_improvement_multiplier)
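For context, the property exercised above is the guarantee of the standard speculative-decoding rejection rule: a draft token x sampled from the draft distribution q is accepted with probability min(1, p(x)/q(x)) and, on rejection, a replacement is drawn from the normalized residual max(p - q, 0); under this rule the emitted token is distributed exactly according to the target p, which is why the observed histogram should converge to target_probs. A minimal reference sketch of the rule itself, not the RejectionSampler implementation; the name and signature are illustrative:

import torch

def accept_or_resample(draft_token: int,
                       draft_probs: torch.Tensor,   # q, shape [vocab_size]
                       target_probs: torch.Tensor,  # p, shape [vocab_size]
                       generator: torch.Generator) -> int:
    # Accept the draft token with probability min(1, p(x) / q(x)).
    # q(x) > 0 by construction because the token was sampled from q.
    p_x = target_probs[draft_token].item()
    q_x = draft_probs[draft_token].item()
    if torch.rand(1, generator=generator).item() < min(1.0, p_x / q_x):
        return draft_token
    # Otherwise resample from the normalized residual max(p - q, 0).
    residual = torch.clamp(target_probs - draft_probs, min=0.0)
    residual = residual / residual.sum()
    return int(torch.multinomial(residual, 1, generator=generator).item())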
def get_ratio_first_to_last(elements: list[float]) -> float:
return elements[0] / elements[-1]
def estimate_rejection_sampling_pdf(
draft_probs: torch.Tensor,
target_probs: torch.Tensor,
k: int,
vocab_size: int,
num_samples: int,
) -> torch.Tensor:
"""Estimate the probability distribution of the output tokens
using rejection sampling.
Args:
draft_probs: Draft probability distribution.
target_probs: Target probability distribution.
k: Number of speculative (draft) tokens per sample.
vocab_size: Size of the vocabulary.
num_samples: Number of samples to draw.
Returns:
Estimated probability distribution of the output tokens.
"""
sampler = RejectionSampler()
# Repeat draft probs num_samples times.
draft_probs = draft_probs.reshape(1, 1,
vocab_size).repeat(num_samples, k, 1)
# Repeat target probs num_samples * (k + 1) times.
target_probs = target_probs.reshape(1, 1, vocab_size).repeat(
num_samples, k + 1, 1).reshape(num_samples * (k + 1), vocab_size)
# Randomly sample draft token ids from draft probs.
draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
num_samples=k,
replacement=True).reshape(
num_samples, k)
# Bonus tokens not used but required.
bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64,
device=DEVICE).repeat(num_samples, 1)
sampling_metadata = create_sampling_metadata(all_greedy=False)
output_token_ids = sampler(draft_token_ids.tolist(), draft_probs,
bonus_token_ids, target_probs,
sampling_metadata)
output_token_ids = output_token_ids[:, :-1].flatten()
hist = torch.histogram(output_token_ids.to(dtype=torch.float,
device="cpu"),
bins=vocab_size,
range=(0, vocab_size),
density=True)
return hist.hist
@ -1,32 +1,37 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import pytest
from vllm.v1.spec_decode.ngram_proposer import NgramProposer import numpy as np
from vllm.v1.utils import ConstantList
from vllm.v1.spec_decode.ngram_proposer import (_find_subarray_kmp,
_kmp_lps_array)
@pytest.fixture def test_kmp_lps_array():
def proposer(): np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([]))
return NgramProposer() np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0]))
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])),
np.array([0, 1, 2]))
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])),
np.array([0, 0, 0, 0]))
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])),
np.array([0, 0, 1, 2, 0]))
def test_kmp_lps_array(proposer): def test_find_subarray_kmp():
assert proposer._kmp_lps_array([]) == [] X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
assert proposer._kmp_lps_array([1]) == [0] assert _find_subarray_kmp(X, 2, 2) is None
assert proposer._kmp_lps_array([1, 1, 1]) == [0, 1, 2] X = np.array([1, 2, 3, 4, 1, 2, 3])
assert proposer._kmp_lps_array([1, 2, 3, 4]) == [0, 0, 0, 0] np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
assert proposer._kmp_lps_array([1, 2, 1, 2, 3]) == [0, 0, 1, 2, 0] np.array([4, 1, 2]))
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4,
1]))
def test_find_subarray_kmp(proposer): np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
X = ConstantList([1, 2, 3, 4, 1, 2, 3, 5, 6]) np.array([4, 1, 2]))
assert proposer._find_subarray_kmp(X, 2, 2) is None np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4,
X = ConstantList([1, 2, 3, 4, 1, 2, 3]) 1]))
assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2] X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
assert proposer._find_subarray_kmp(X, 2, 2) == [4, 1] np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
assert proposer._find_subarray_kmp(X, 1, 3) == [4, 1, 2] np.array([4, 1, 2]))
assert proposer._find_subarray_kmp(X, 1, 2) == [4, 1]
X = ConstantList([1, 3, 6, 2, 3, 4, 1, 2, 3])
assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2]
# Return on the first match # Return on the first match
assert proposer._find_subarray_kmp(X, 1, 3) == [6, 2, 3] np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
np.array([6, 2, 3]))
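For readers less familiar with KMP: _kmp_lps_array(x) returns, for each prefix of x, the length of the longest proper prefix of that prefix which is also its suffix (e.g. [1, 2, 1, 2, 3] -> [0, 0, 1, 2, 0]), and _find_subarray_kmp(x, n, k), as encoded by the assertions above, locates the leftmost earlier occurrence of the last n tokens of x and returns the k tokens that follow it (None when there is no such occurrence). A naive quadratic reference sketch of that contract, not the KMP implementation used by the proposer; the helper name is illustrative:

import numpy as np
from typing import Optional

def naive_find_subarray(x: np.ndarray, n: int, k: int) -> Optional[np.ndarray]:
    # Leftmost earlier occurrence of the length-n suffix of x,
    # followed by the next k tokens.
    suffix = x[-n:]
    for start in range(len(x) - n):
        if np.array_equal(x[start:start + n], suffix) and start + n + k <= len(x):
            return x[start + n:start + n + k]
    return None

# e.g. naive_find_subarray(np.array([1, 2, 3, 4, 1, 2, 3]), 2, 3) -> array([4, 1, 2])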
@ -3,11 +3,16 @@
Run `pytest tests/v1/tpu/test_basic.py`. Run `pytest tests/v1/tpu/test_basic.py`.
""" """
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest import pytest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...conftest import VllmRunner if TYPE_CHECKING:
from tests.conftest import VllmRunner
MODELS = [ MODELS = [
# "Qwen/Qwen2-7B-Instruct", # "Qwen/Qwen2-7B-Instruct",
@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1]
@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
def test_models( def test_models(
monkeypatch, vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str, model: str,
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
@ -41,7 +47,7 @@ def test_models(
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
with VllmRunner( with vllm_runner(
model, model,
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
@ -50,5 +56,5 @@ def test_models(
tensor_parallel_size=tensor_parallel_size) as vllm_model: tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens) max_tokens)
output = vllm_outputs[0][1] output = vllm_outputs[0][1]
assert "1024" in output assert "1024" in output
@ -124,8 +124,9 @@ def _construct_expected_sampling_metadata(
if req.sampling_params.allowed_token_ids: if req.sampling_params.allowed_token_ids:
allowed_token_ids_mask[index_in_input_batch][ allowed_token_ids_mask[index_in_input_batch][
req.sampling_params.allowed_token_ids] = True req.sampling_params.allowed_token_ids] = True
bad_words_token_ids[ if req.sampling_params.bad_words_token_ids:
index_in_input_batch] = req.sampling_params.bad_words_token_ids bad_words_token_ids[
index_in_input_batch] = req.sampling_params.bad_words_token_ids
return SamplingMetadata( return SamplingMetadata(
temperature=torch.tensor(temperature, dtype=torch.float, temperature=torch.tensor(temperature, dtype=torch.float,
@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group(
# different systems (e.g. RPC) in case the store is multi-tenant. # different systems (e.g. RPC) in case the store is multi-tenant.
prefix_store = PrefixStore(init_method, store) prefix_store = PrefixStore(init_method, store)
pg_options = ProcessGroup.Options(backend=backend, timeout=timeout)
pg: ProcessGroup = ProcessGroup( pg: ProcessGroup = ProcessGroup(
prefix_store, prefix_store,
group_rank, group_rank,
group_size, group_size,
pg_options,
) )
if backend == "gloo": if backend == "gloo":
@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group(
backend_options) backend_options)
backend_type = ProcessGroup.BackendType.NCCL backend_type = ProcessGroup.BackendType.NCCL
device = torch.device("cuda") device = torch.device("cuda")
else:
raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group() backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class) pg._register_backend(device, backend_type, backend_class)
@ -1487,13 +1487,6 @@ class EngineArgs:
recommend_to_remove=False) recommend_to_remove=False)
return False return False
# No MistralTokenizer support so far (not compatible
# with xgrammar)
if model_config.tokenizer_mode == "mistral":
_raise_or_fallback(feature_name="--tokenizer-mode mistral",
recommend_to_remove=False)
return False
# No CPU offloading yet. # No CPU offloading yet.
if self.cpu_offload_gb != EngineArgs.cpu_offload_gb: if self.cpu_offload_gb != EngineArgs.cpu_offload_gb:
_raise_or_fallback(feature_name="--cpu-offload-gb", _raise_or_fallback(feature_name="--cpu-offload-gb",
@ -379,6 +379,7 @@ class InputPreprocessor:
multi_modal_data, multi_modal_data,
mm_processor_kwargs, mm_processor_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
prompt_token_ids = self._tokenize_prompt( prompt_token_ids = self._tokenize_prompt(
@ -401,6 +402,7 @@ class InputPreprocessor:
prompt: SingletonPrompt, prompt: SingletonPrompt,
request_id: str, request_id: str,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
return_mm_hashes: bool = False,
) -> SingletonInputs: ) -> SingletonInputs:
"""Async version of :meth:`_extract_prompt_components`.""" """Async version of :meth:`_extract_prompt_components`."""
parsed = parse_singleton_prompt(prompt) parsed = parse_singleton_prompt(prompt)
@ -431,6 +433,7 @@ class InputPreprocessor:
multi_modal_data, multi_modal_data,
mm_processor_kwargs, mm_processor_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
return token_inputs( return token_inputs(
@ -452,6 +455,7 @@ class InputPreprocessor:
multi_modal_data, multi_modal_data,
mm_processor_kwargs, mm_processor_kwargs,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
prompt_token_ids = await self._tokenize_prompt_async( prompt_token_ids = await self._tokenize_prompt_async(
@ -726,6 +730,7 @@ class InputPreprocessor:
prompt, prompt,
request_id=request_id, request_id=request_id,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
return self._build_decoder_only_llm_inputs( return self._build_decoder_only_llm_inputs(
@ -746,6 +751,7 @@ class InputPreprocessor:
prompt, prompt,
request_id=request_id, request_id=request_id,
lora_request=lora_request, lora_request=lora_request,
return_mm_hashes=return_mm_hashes,
) )
return self._build_decoder_only_llm_inputs( return self._build_decoder_only_llm_inputs(
@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
}
}
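The JSON files added in this part of the diff appear to be Triton kernel tuning tables of the same shape as vLLM's fused-MoE configs: each key is a batch size (number of tokens) and each value holds the launch parameters (block sizes, group size, warp count, pipeline stages) found best for that size. The sketch below shows one plausible way such a table could be loaded and matched at runtime; the file path and the nearest-key selection rule are assumptions for illustration, not the exact vLLM lookup logic.

import json
from pathlib import Path
from typing import Any


def load_kernel_configs(path: str | Path) -> dict[int, dict[str, Any]]:
    """Parse a tuning table like the one above: a JSON object whose keys are
    batch sizes (as strings) and whose values are Triton launch parameters."""
    with open(path) as f:
        raw = json.load(f)
    return {int(batch_size): params for batch_size, params in raw.items()}


def pick_config(configs: dict[int, dict[str, Any]],
                num_tokens: int) -> dict[str, Any]:
    # Assumed selection rule: use the tuned entry whose batch size is closest
    # to the actual number of tokens in this step.
    best_key = min(configs, key=lambda bs: abs(bs - num_tokens))
    return configs[best_key]


if __name__ == "__main__":
    # "moe_config.json" is a placeholder path, not a file from this diff.
    configs = load_kernel_configs("moe_config.json")
    print(pick_config(configs, num_tokens=200))  # likely selects the "256" entry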

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2048": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"2048": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 2
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"2": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"16": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 4
},
"32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"48": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
}
}

View File

@ -0,0 +1,146 @@
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 2
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 5
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
}
}

Some files were not shown because too many files have changed in this diff.