From a73e183e36a818ea95f442ae1751bc66cf4f135d Mon Sep 17 00:00:00 2001
From: Sibi <85477603+t-sibiraj@users.noreply.github.com>
Date: Mon, 17 Mar 2025 11:35:57 +0800
Subject: [PATCH] [Misc] Replace os environ to monkeypatch in test suite
 (#14516)

Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham
Co-authored-by: Cyrus Leung
Co-authored-by: Aaron Pham
---
 .buildkite/test-pipeline.yaml | 2 +-
 .../test_basic_correctness.py | 105 +++---
 .../basic_correctness/test_chunked_prefill.py | 168 +++++-----
 tests/basic_correctness/test_cumem.py | 62 ++--
 tests/compile/test_basic_correctness.py | 207 ++++++------
 tests/compile/test_full_graph.py | 115 ++++++-
 tests/compile/utils.py | 93 ------
 tests/conftest.py | 2 +-
 tests/distributed/test_comm_ops.py | 85 +++--
 tests/distributed/test_custom_all_reduce.py | 173 +++++-----
 tests/distributed/test_pipeline_partition.py | 60 ++--
 tests/distributed/test_pp_cudagraph.py | 38 ++-
 tests/entrypoints/llm/test_accuracy.py | 4 +-
 .../offline_mode/test_offline_mode.py | 49 +--
 .../openai/correctness/test_lmeval.py | 5 +-
 tests/kernels/test_attention_selector.py | 129 +++++---
 tests/kernels/test_awq.py | 60 ++--
 tests/kernels/test_rocm_attention_selector.py | 18 +-
 tests/kernels/utils.py | 64 ++--
 .../{disagg_test.py => test_disagg.py} | 0
 .../{module_test.py => test_module.py} | 0
 .../models/decoder_only/language/test_fp8.py | 120 +++---
 .../models/embedding/language/test_gritlm.py | 96 +++---
 tests/models/test_oot_registration.py | 130 ++++----
 tests/mq_llm_engine/test_error_handling.py | 31 +-
 .../multi_step/test_correctness_async_llm.py | 202 ++++------
 tests/multi_step/test_correctness_llm.py | 299 ++++++++---------
 tests/neuron/1_core/test_block_table.py | 80 ++---
 tests/neuron/1_core/test_prefix_prefill.py | 306 +++++++++---------
 tests/plugins_tests/test_platform_plugins.py | 13 +-
 tests/plugins_tests/test_scheduler_plugins.py | 62 ++--
 tests/prefix_caching/test_prefix_caching.py | 111 ++++---
 tests/test_regression.py | 16 +-
 tests/test_utils.py | 63 ++--
 tests/tpu/test_custom_dispatcher.py | 25 +-
 tests/tracing/test_tracing.py | 277 ++++++++--------
 tests/utils.py | 11 +-
 tests/v1/e2e/test_ngram_spec_decode.py | 11 +-
 tests/v1/engine/test_async_llm.py | 11 +-
 tests/v1/engine/test_engine_core.py | 10 +-
 tests/v1/engine/test_engine_core_client.py | 5 +-
 tests/v1/sample/test_logprobs.py | 224 +++++------
 tests/v1/tpu/test_basic.py | 16 +-
 43 files changed, 1900 insertions(+), 1658 deletions(-)
 delete mode 100644 tests/compile/utils.py
 rename tests/kv_transfer/{disagg_test.py => test_disagg.py} (100%)
 rename tests/kv_transfer/{module_test.py => test_module.py} (100%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a6616d7b41480..f85572e7c234c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -522,7 +522,7 @@ steps:
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

- label: Plugin Tests (2 GPUs) # 40min
  working_dir: "/vllm-workspace/tests"
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 0cb3b739b7245..1458f0893a93c 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ 
b/tests/basic_correctness/test_basic_correctness.py @@ -47,6 +47,7 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) def test_models( + monkeypatch: pytest.MonkeyPatch, hf_runner, model: str, backend: str, @@ -63,31 +64,33 @@ def test_models( pytest.skip( f"{backend} does not support gemma2 with full context length.") - os.environ["VLLM_ATTENTION_BACKEND"] = backend + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", backend) - # 5042 tokens for gemma2 - # gemma2 has alternating sliding window size of 4096 - # we need a prompt with more than 4096 tokens to test the sliding window - prompt = "The following numbers of the sequence " + ", ".join( - str(i) for i in range(1024)) + " are:" - example_prompts = [prompt] + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 tokens to test the sliding window + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with VllmRunner(model, - max_model_len=8192, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -104,6 +107,7 @@ def test_models( ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) def test_models_distributed( + monkeypatch: pytest.MonkeyPatch, hf_runner, vllm_runner, example_prompts, @@ -116,34 +120,41 @@ def test_models_distributed( if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + with monkeypatch.context() as monkeypatch_context: + if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + # test Ray Compiled Graph + monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if attention_backend: - os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend + if attention_backend: + monkeypatch_context.setenv( + "VLLM_ATTENTION_BACKEND", + attention_backend, + ) - dtype = "half" - max_tokens = 5 + dtype = "half" + max_tokens = 5 - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method + # (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index be007de321c8a..06c9e25ed8dd8 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,16 +7,22 @@ prefill requests are chunked. Run `pytest tests/models/test_chunked_prefill.py`. """ -import os + +from __future__ import annotations + +from typing import TYPE_CHECKING import pytest -from tests.kernels.utils import override_backend_env_variable from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal from ..utils import multi_gpu_test +if TYPE_CHECKING: + from .conftest import HfRunner, VllmRunner + MODELS = [ "facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct", @@ -24,12 +30,14 @@ MODELS = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the file. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield @pytest.mark.parametrize("model", MODELS) @@ -42,8 +50,8 @@ def use_v0_only(monkeypatch): @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -52,37 +60,39 @@ def test_models( enforce_eager: bool, tensor_parallel_size: int, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Checks exact match decode between huggingface model and vllm runner with chunked prefill. 
""" - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with vllm_runner( + model, + dtype=dtype, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @multi_gpu_test(num_gpus=2) @@ -90,57 +100,61 @@ def test_models( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models_distributed( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, distributed_executor_backend: str, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) + if (model == "meta-llama/Llama-3.2-1B-Instruct" + and distributed_executor_backend == "ray"): + # test Ray Compiled Graph + m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") + m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): - # test Ray Compiled Graph - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + dtype = "half" + max_tokens = 5 + chunked_prefill_token_size = 16 - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 + # Add a chunked prefill config. + max_num_seqs = min(chunked_prefill_token_size, 256) + assert chunked_prefill_token_size != -1 + enable_chunked_prefill = True + max_num_batched_tokens = chunked_prefill_token_size - # Add a chunked prefill config. - max_num_seqs = min(chunked_prefill_token_size, 256) - assert chunked_prefill_token_size != -1 - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with + # fork method (the default method). - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. 
- # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy( + example_prompts, + max_tokens, + ) - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize( @@ -158,7 +172,7 @@ def test_models_distributed( # the async postprocessor @pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models_with_fp8_kv_cache( - vllm_runner, + vllm_runner: VllmRunner, example_prompts, kv_cache_dtype: str, model: str, @@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache( @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("dtype", ["half"]) def test_with_prefix_caching( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, @@ -254,8 +268,10 @@ def test_with_prefix_caching( ) as vllm_model: outputs[enable] = [] for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy([prompt], - max_tokens) + outputs[enable] += vllm_model.generate_greedy( + [prompt], + max_tokens, + ) check_outputs_equal( outputs_0_lst=outputs[False], @@ -274,8 +290,8 @@ def test_with_prefix_caching( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_models_cpu( - hf_runner, - vllm_runner, + hf_runner: HfRunner, + vllm_runner: VllmRunner, example_prompts, model: str, dtype: str, @@ -283,7 +299,7 @@ def test_models_cpu( chunked_prefill_token_size: int, enforce_eager: bool, attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: test_models( hf_runner, @@ -307,7 +323,7 @@ def test_models_cpu( @pytest.mark.cpu_model @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") def test_with_prefix_caching_cpu( - vllm_runner, + vllm_runner: VllmRunner, max_tokens: int, enforce_eager: bool, chunk_size: int, diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index ba81f2bb79d11..f5ee469fb00a9 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -123,40 +123,38 @@ def test_cumem_with_cudagraph(): # sleep mode with pytorch checkpoint ("facebook/opt-125m", False), ]) -def test_end_to_end(model: str, use_v1: bool): - import os - os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0" - free, total = torch.cuda.mem_get_info() - used_bytes_baseline = total - free # in case other process is running - llm = 
LLM(model, enable_sleep_mode=True) - prompt = "How are you?" - sampling_params = SamplingParams(temperature=0, max_tokens=10) - output = llm.generate(prompt, sampling_params) +def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM(model, enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) - # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, - # which is difficult to measure in the test. therefore, we only - # test sleep level 1 here. - llm.sleep(level=1) + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) - free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() - used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline - # now the memory usage is mostly cudagraph memory pool, - # and it should be less than the model weights (1B model, 2GiB weights) + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) - # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) - # is captured but cannot be releasesd from PyTorch due to a known bug, - # therefore high memory usage after `llm.sleep` is called is expected. - # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode - # in V1. - if use_v1: - assert used_bytes < 7 * GiB_bytes - else: - assert used_bytes < 2 * GiB_bytes + # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size) + # is captured but cannot be releasesd from PyTorch due to a known bug, + # therefore high memory usage after `llm.sleep` is called is expected. + # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode + # in V1. 
+ if use_v1: + assert used_bytes < 7 * GiB_bytes + else: + assert used_bytes < 2 * GiB_bytes - llm.wake_up() - output2 = llm.generate(prompt, sampling_params) + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) - # cmp output - assert output[0].outputs[0].text == output2[0].outputs[0].text - - del os.environ["VLLM_USE_V1"] + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 48323b21a8c42..b639fd719ca0a 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import dataclasses -from typing import Optional import pytest @@ -22,75 +22,76 @@ class TestSetting: fullgraph: bool -# representative settings for testing -test_settings = [ - # basic llama model - TestSetting( - model="meta-llama/Llama-3.2-1B-Instruct", - model_args=[], - pp_size=2, - tp_size=2, - attn_backend="FLASHINFER", - method="generate", - fullgraph=True, - ), - # llama model with quantization - TestSetting( - model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - model_args=["--quantization", "gptq"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate", - fullgraph=True, - ), - # MoE model - TestSetting( - model="ibm/PowerMoE-3b", - model_args=[], - pp_size=1, - tp_size=2, - attn_backend="FLASH_ATTN", - method="generate", - fullgraph=True, - ), - # embedding model - TestSetting( - model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="encode", - fullgraph=True, - ), - # encoder-based embedding model (BERT) - TestSetting( - model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="XFORMERS", - method="encode", - fullgraph=True, - ), - # vision language model - TestSetting( - model="microsoft/Phi-3.5-vision-instruct", - model_args=["--trust-remote-code", "--max-model-len", "2048"], - pp_size=2, - tp_size=1, - attn_backend="FLASH_ATTN", - method="generate_with_image", - fullgraph=False, - ), -] - - # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize("test_setting", test_settings) -def test_compile_correctness(test_setting: TestSetting): +@pytest.mark.parametrize( + "test_setting", + [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B-Instruct", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="encode", + fullgraph=True, + ), + # encoder-based embedding model (BERT) + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--task", "embed"], + pp_size=1, + tp_size=1, + attn_backend="XFORMERS", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + 
model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), + ]) +def test_compile_correctness( + monkeypatch: pytest.MonkeyPatch, + test_setting: TestSetting, +): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. @@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting): fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") - import os - os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ - ["-tp", str(tp_size)] - all_args: list[list[str]] = [] - all_envs: list[Optional[dict[str, str]]] = [] + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) + final_args = [ + "--enforce-eager", *model_args, "-pp", + str(pp_size), "-tp", + str(tp_size) + ] - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.PIECEWISE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) + all_args: list[list[str]] = [] + all_envs: list[dict[str, str] | None] = [] - # inductor will change the output, so we only compare if the output - # is close, not exactly the same. - compare_all_settings( - model, - all_args, - all_envs, - method=method if method != "generate" else "generate_close") - all_envs.clear() - all_args.clear() + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) - for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]: - all_args.append(final_args + [f"-O{level}"]) - all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. 
+ compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + all_args.clear() - compare_all_settings(model, all_args * 3, all_envs, method=method) + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) + if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: + # "DYNAMO_ONCE" will always use fullgraph + all_envs[-1][ + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 6e83fa36881e4..cf463f3e75254 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,22 +1,115 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest +from __future__ import annotations +from typing import Any + +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm import LLM, SamplingParams from vllm.config import CompilationLevel +from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test -from .utils import TEST_MODELS, check_full_graph_support -@pytest.mark.parametrize("model_info", TEST_MODELS) +@pytest.fixture(params=None, name="model_info") +def models_list_fixture(request): + TEST_MODELS: list[tuple[str, dict[str, Any]]] = [ + ("facebook/opt-125m", {}), + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), + ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { + "quantization": "compressed-tensors" + }), + ("meta-llama/Llama-3.2-1B-Instruct", {}), + ] + + if is_quant_method_supported("aqlm"): + TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { + "quantization": "aqlm" + })) + + # TODO: figure out why this fails. 
+ if False and is_quant_method_supported("gguf"): # noqa: SIM223 + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { + "quantization": "gguf" + })) + + if is_quant_method_supported("gptq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { + "quantization": "gptq" + })) + + if is_quant_method_supported("gptq_marlin"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { + "quantization": "gptq_marlin" + })) + + if is_quant_method_supported("gptq_marlin_24"): + TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { + "quantization": "gptq_marlin_24" + })) + + if is_quant_method_supported("marlin"): + TEST_MODELS.append( + ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { + "quantization": "marlin" + })) + + if not current_platform.is_rocm() and is_quant_method_supported("awq"): + TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { + "quantization": "AWQ" + })) + + return TEST_MODELS + + @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE], +) +@pytest.mark.parametrize("model_info", "", indirect=True) @fork_new_process_for_each_test -def test_full_graph(model_info, optimization_level): - model = model_info[0] - model_kwargs = model_info[1] - check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1) +def test_full_graph( + monkeypatch: pytest.MonkeyPatch, + model_info: tuple[str, dict[str, Any]], + optimization_level: int, +): + model, model_kwargs = model_info + + with monkeypatch.context() as m: + # make sure these models can be captured in full graph mode + m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + print(f"MODEL={model}") + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + llm = LLM( + model=model, + enforce_eager=True, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + compilation_config=optimization_level, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/compile/utils.py b/tests/compile/utils.py deleted file mode 100644 index fb8270c26b1b0..0000000000000 --- a/tests/compile/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import os - -import torch - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -TEST_MODELS = [ - ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), - ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", { - "quantization": "compressed-tensors" - }), - ("meta-llama/Llama-3.2-1B-Instruct", {}), -] - -if is_quant_method_supported("aqlm"): - TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { - "quantization": "aqlm" - })) - -# TODO: figure out why this fails. 
-if False and is_quant_method_supported("gguf"): # noqa: SIM223 - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { - "quantization": "gguf" - })) - -if is_quant_method_supported("gptq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", { - "quantization": "gptq" - })) - -if is_quant_method_supported("gptq_marlin"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", { - "quantization": "gptq_marlin" - })) - -if is_quant_method_supported("gptq_marlin_24"): - TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", { - "quantization": "gptq_marlin_24" - })) - -if is_quant_method_supported("marlin"): - TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", { - "quantization": "marlin" - })) - -if not current_platform.is_rocm() and is_quant_method_supported("awq"): - TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { - "quantization": "AWQ" - })) - - -def check_full_graph_support(model, - model_kwargs, - optimization_level, - tp_size=1): - # make sure these models can be captured in full graph mode - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - - print(f"MODEL={model}") - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=model, - enforce_eager=True, - tensor_parallel_size=tp_size, - disable_custom_all_reduce=True, - compilation_config=optimization_level, - **model_kwargs) - - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/conftest.py b/tests/conftest.py index 41c0e62ce14f3..30e5ca2eb137a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items): skip_optional = pytest.mark.skip(reason="need --optional option to run") for item in items: if "optional" in item.keywords: - item.add_marker(skip_optional) + item.add_marker(skip_optional) \ No newline at end of file diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 7b0346b8ab50f..ac6d6aae30063 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -3,7 +3,10 @@ Run `pytest tests/distributed/test_comm_ops.py`. 
""" -import os + +from __future__ import annotations + +from typing import Any, Callable import pytest import ray @@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel @ray.remote(num_gpus=1, max_calls=1) -def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_reduce_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def all_gather_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): +def broadcast_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_tensor_dict_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) -def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, - distributed_init_port: str): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) +def send_recv_test_worker( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + pp_size: int, + rank: int, + distributed_init_port: str, +): + 
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker ]) -def test_multi_process_tensor_parallel(tp_size, test_target): - multi_process_parallel(tp_size, 1, test_target) +def test_multi_process_tensor_parallel( + monkeypatch: pytest.MonkeyPatch, + tp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, tp_size, 1, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target): @pytest.mark.parametrize("pp_size", [2]) @pytest.mark.parametrize( "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]) -def test_multi_process_pipeline_parallel(pp_size, test_target): - multi_process_parallel(1, pp_size, test_target) +def test_multi_process_pipeline_parallel( + monkeypatch: pytest.MonkeyPatch, + pp_size: int, + test_target: Callable[..., Any], +): + multi_process_parallel(monkeypatch, 1, pp_size, test_target) @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target): broadcast_tensor_dict_test_worker ]) def test_multi_process_tensor_parallel_pipeline_parallel( - tp_size, pp_size, test_target): - multi_process_parallel(tp_size, pp_size, test_target) + tp_size: int, + pp_size: int, + test_target: Callable[..., Any], + monkeypatch: pytest.MonkeyPatch, +): + multi_process_parallel(monkeypatch, tp_size, pp_size, test_target) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 4928690bebb07..bfa7d06c4d075 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os import random import pytest @@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes): @ray.remote(num_gpus=1, max_calls=1) -def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) - ensure_model_parallel_initialized(tp_size, pp_size) - group = get_tensor_model_parallel_group().device_group +def graph_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) + ensure_model_parallel_initialized(tp_size, pp_size) + group = get_tensor_model_parallel_group().device_group - # A small all_reduce for warmup. - # this is needed because device communicators might be created lazily - # (e.g. NCCL). This will ensure that the communicator is initialized - # before any communication happens, so that this group can be used for - # graph capture immediately. - data = torch.zeros(1) - data = data.to(device=device) - torch.distributed.all_reduce(data, group=group) - torch.cuda.synchronize() - del data + # A small all_reduce for warmup. 
+ # this is needed because device communicators might be created lazily + # (e.g. NCCL). This will ensure that the communicator is initialized + # before any communication happens, so that this group can be used for + # graph capture immediately. + data = torch.zeros(1) + data = data.to(device=device) + torch.distributed.all_reduce(data, group=group) + torch.cuda.synchronize() + del data - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 - for sz in test_sizes: - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture(device=device) as graph_capture_context: - # use integers so result matches NCCL exactly - inp1 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - inp2 = torch.randint(1, - 16, (sz, ), - dtype=dtype, - device=torch.cuda.current_device()) - torch.cuda.synchronize() - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, - stream=graph_capture_context.stream): - for i in range(num_communication): - out1 = tensor_model_parallel_all_reduce(inp1) - # the input buffer is immediately modified to test - # synchronization - dist.all_reduce(inp1, group=group) - out2 = tensor_model_parallel_all_reduce(inp2) - dist.all_reduce(inp2, group=group) - graph.replay() - torch.testing.assert_close(out1, inp1) - torch.testing.assert_close(out2, inp2) + for sz in test_sizes: + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + with graph_capture(device=device) as graph_capture_context: + # use integers so result matches NCCL exactly + inp1 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + inp2 = torch.randint(1, + 16, (sz, ), + dtype=dtype, + device=torch.cuda.current_device()) + torch.cuda.synchronize() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, + stream=graph_capture_context.stream): + for i in range(num_communication): + out1 = tensor_model_parallel_all_reduce(inp1) + # the input buffer is immediately modified to test + # synchronization + dist.all_reduce(inp1, group=group) + out2 = tensor_model_parallel_all_reduce(inp2) + dist.all_reduce(inp2, group=group) + graph.replay() + torch.testing.assert_close(out1, inp1) + torch.testing.assert_close(out2, inp2) @ray.remote(num_gpus=1, max_calls=1) -def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): - os.environ.pop("CUDA_VISIBLE_DEVICES", None) - device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) - init_test_distributed_environment(tp_size, pp_size, rank, - distributed_init_port) +def eager_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pp_size, + rank, + distributed_init_port, +): + with monkeypatch.context() as m: + m.delenv("CUDA_VISIBLE_DEVICES", raising=False) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + init_test_distributed_environment(tp_size, pp_size, rank, + distributed_init_port) - # we use the first group to communicate once - # and the second group to communicate twice - # and so on - # this is used to demonstrate that each group can - # communicate independently - num_communication = rank // tp_size + 1 - sz = 1024 - fa 
= get_tp_group().ca_comm - inp = torch.ones(sz, dtype=torch.float32, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + # we use the first group to communicate once + # and the second group to communicate twice + # and so on + # this is used to demonstrate that each group can + # communicate independently + num_communication = rank // tp_size + 1 + sz = 1024 + fa = get_tp_group().ca_comm + inp = torch.ones(sz, dtype=torch.float32, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) - inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) - out = inp - for _ in range(num_communication): - out = fa.all_reduce(out, registered=False) - torch.testing.assert_close(out, inp * (tp_size**num_communication)) + inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) + out = inp + for _ in range(num_communication): + out = fa.all_reduce(out, registered=False) + torch.testing.assert_close(out, inp * (tp_size**num_communication)) @pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("pipeline_parallel_size", [1, 2]) @pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce]) -def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target): +def test_custom_allreduce( + monkeypatch: pytest.MonkeyPatch, + tp_size, + pipeline_parallel_size, + test_target, +): world_size = tp_size * pipeline_parallel_size if world_size > torch.cuda.device_count(): pytest.skip("Not enough GPUs to run the test.") - multi_process_parallel(tp_size, pipeline_parallel_size, test_target) + multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, + test_target) diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 18c5be29c5ce1..7bf93f270148b 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -7,33 +7,35 @@ import pytest from vllm.distributed.utils import get_pp_indices -def test_custom_layer_partition(): +def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch): - def _verify(partition_str, num_layers, pp_size, goldens): - bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None) - os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str - for pp_rank, golden in enumerate(goldens): - assert get_pp_indices(num_layers, pp_rank, pp_size) == golden - if bak is not None: - os.environ["VLLM_PP_LAYER_PARTITION"] = bak + with monkeypatch.context() as m: - # Even partition - _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Balanced partition - _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) - # Put reminder somewhere - _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) - # Invalid partition strings - with pytest.raises(ValueError): - _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - with pytest.raises(ValueError): - _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of partitions - with pytest.raises(ValueError): - _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) - # Wrong number of layers - with pytest.raises(ValueError): - _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + def _verify(partition_str, num_layers, pp_size, goldens): + bak = os.environ.get("VLLM_PP_LAYER_PARTITION", 
None) + m.setenv("VLLM_PP_LAYER_PARTITION", partition_str) + for pp_rank, golden in enumerate(goldens): + assert get_pp_indices(num_layers, pp_rank, pp_size) == golden + if bak is not None: + m.setenv("VLLM_PP_LAYER_PARTITION", bak) + + # Even partition + _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Balanced partition + _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)]) + # Put reminder somewhere + _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)]) + # Invalid partition strings + with pytest.raises(ValueError): + _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + with pytest.raises(ValueError): + _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of partitions + with pytest.raises(ValueError): + _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) + # Wrong number of layers + with pytest.raises(ValueError): + _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)]) @pytest.mark.parametrize( @@ -55,6 +57,10 @@ def test_custom_layer_partition(): (5, 3, 1, (2, 4)), (5, 3, 2, (4, 5)), ]) -def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int, - pp_rank: int, indices: tuple[int, int]): +def test_uneven_auto_partition( + num_hidden_layers: int, + pp_size: int, + pp_rank: int, + indices: tuple[int, int], +): assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size) diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 3bc85b05e7d15..19414971f2b46 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,11 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -import os +from typing import TYPE_CHECKING import pytest from ..utils import compare_two_settings, fork_new_process_for_each_test +if TYPE_CHECKING: + from typing_extensions import LiteralString + @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ (2, "JackFram/llama-160m"), @@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test "FLASHINFER", ]) @fork_new_process_for_each_test -def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): - cudagraph_args = [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "float16", - "--pipeline-parallel-size", - str(PP_SIZE), - "--distributed-executor-backend", - "mp", - ] - os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND +def test_pp_cudagraph( + monkeypatch: pytest.MonkeyPatch, + PP_SIZE: int, + MODEL_NAME: str, + ATTN_BACKEND: LiteralString, +): + with monkeypatch.context() as m: + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "mp", + ] + m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND) - eager_args = cudagraph_args + ["--enforce-eager"] + eager_args = cudagraph_args + ["--enforce-eager"] - compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 3ebc5a44d80c6..77fbb5827da9e 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4" @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 is currently only supported 
on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): run_test(more_args) -def test_lm_eval_accuracy_v0_engine(monkeypatch): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 85156d6931c8c..23fd72f4ebbb9 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -53,32 +53,37 @@ def cache_models(): @pytest.mark.skip_global_cleanup @pytest.mark.usefixtures("cache_models") -def test_offline_mode(monkeypatch): +def test_offline_mode(monkeypatch: pytest.MonkeyPatch): # Set HF to offline mode and ensure we can still construct an LLM - try: - monkeypatch.setenv("HF_HUB_OFFLINE", "1") - monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") + with monkeypatch.context() as m: + try: + m.setenv("HF_HUB_OFFLINE", "1") + m.setenv("VLLM_NO_USAGE_STATS", "1") - def disable_connect(*args, **kwargs): - raise RuntimeError("No http calls allowed") + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") - monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", - disable_connect) - monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", - disable_connect) + m.setattr( + urllib3.connection.HTTPConnection, + "connect", + disable_connect, + ) + m.setattr( + urllib3.connection.HTTPSConnection, + "connect", + disable_connect, + ) - # Need to re-import huggingface_hub and friends to setup offline mode - _re_import_modules() - # Cached model files should be used in offline mode - for model_config in MODEL_CONFIGS: - LLM(**model_config) - finally: - # Reset the environment after the test - # NB: Assuming tests are run in online mode - monkeypatch.delenv("HF_HUB_OFFLINE") - monkeypatch.delenv("VLLM_NO_USAGE_STATS") - _re_import_modules() - pass + # Need to re-import huggingface_hub + # and friends to setup offline mode + _re_import_modules() + # Cached model files should be used in offline mode + for model_config in MODEL_CONFIGS: + LLM(**model_config) + finally: + # Reset the environment after the test + # NB: Assuming tests are run in online mode + _re_import_modules() def _re_import_modules(): diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index e4c087db3d4f0..d3948e2ed575e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -70,7 +70,7 @@ def run_test(more_args): @pytest.mark.skipif(not current_platform.is_cuda() and not current_platform.is_tpu(), reason="V1 currently only supported on CUDA and TPU") -def test_lm_eval_accuracy_v1_engine(monkeypatch): +def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): """Run with the V1 Engine.""" with monkeypatch.context() as m: @@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch): @pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): +def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, + more_args): """Run with the V0 Engine.""" with monkeypatch.context() as m: diff --git 
a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 570e643e0364d..66db7509cc474 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,13 +5,12 @@ from unittest.mock import Mock, patch import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.rocm import RocmPlatform -from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL @pytest.fixture(autouse=True) @@ -25,87 +24,111 @@ def clear_cache(): "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) @pytest.mark.parametrize("use_v1", [True, False]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) -def test_env(name: str, use_v1: bool, device: str, monkeypatch): +def test_env( + name: str, + use_v1: bool, + device: str, + monkeypatch: pytest.MonkeyPatch, +): """Test that the attention selector can be set via environment variable. Note that we do not test FlashAttn because it is the default backend. """ - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, name) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, name) - if device == "cpu": - with patch("vllm.attention.selector.current_platform", CpuPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "TORCH_SDPA" - elif device == "hip": - with patch("vllm.attention.selector.current_platform", RocmPlatform()): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" - assert backend.get_name() == EXPECTED - elif device == "openvino": - with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()), patch.dict('sys.modules', - {'openvino': Mock()}): - backend = get_attn_backend(16, torch.float16, torch.float16, 16, - False) - assert backend.get_name() == "OPENVINO" - else: - if name in ["XFORMERS", "FLASHINFER"]: + if device == "cpu": with patch("vllm.attention.selector.current_platform", - CudaPlatform()): + CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == "TORCH_SDPA" + elif device == "hip": + with patch("vllm.attention.selector.current_platform", + RocmPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == EXPECTED + elif device == "openvino": + with patch("vllm.attention.selector.current_platform", + OpenVinoPlatform()), patch.dict('sys.modules', + {'openvino': Mock()}): + backend = get_attn_backend(16, torch.float16, torch.float16, + 16, False) + assert backend.get_name() == "OPENVINO" + else: + if name in ["XFORMERS", "FLASHINFER"]: + with patch("vllm.attention.selector.current_platform", + CudaPlatform()): + backend = get_attn_backend(16, torch.float16, + torch.float16, 16, False) + EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name + assert backend.get_name() == EXPECTED -def 
test_flash_attn(monkeypatch): +def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" # TODO: When testing for v1, pipe in `use_v1` as an argument to # get_attn_backend - override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL) - # Unsupported CUDA arch - with patch("torch.cuda.get_device_capability", return_value=(7, 5)): + # Unsupported CUDA arch + monkeypatch.setattr(torch.cuda, "get_device_capability", lambda: + (7, 5)) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported data type - backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Reset the monkeypatch for subsequent tests + monkeypatch.undo() - # Unsupported kv cache data type - backend = get_attn_backend(16, torch.float16, "fp8", 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported data type + backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported block size - backend = get_attn_backend(16, torch.float16, None, 8, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported kv cache data type + backend = get_attn_backend(16, torch.float16, "fp8", 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL - # flash-attn is not installed - with patch.dict('sys.modules', {'vllm_flash_attn': None}): + # Unsupported block size + backend = get_attn_backend(16, torch.float16, None, 8, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # flash-attn is not installed + import sys + original_module = sys.modules.get('vllm_flash_attn') + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None) backend = get_attn_backend(16, torch.float16, None, 16, False) assert backend.get_name() != STR_FLASH_ATTN_VAL - # Unsupported head size - backend = get_attn_backend(17, torch.float16, None, 16, False) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Restore the original module if it existed + if original_module is not None: + monkeypatch.setitem(sys.modules, 'vllm_flash_attn', + original_module) + else: + monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False) - # Attention-free models should bypass env and use PlaceholderAttention - backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) - assert backend.get_name() != STR_FLASH_ATTN_VAL + # Unsupported head size + backend = get_attn_backend(17, torch.float16, None, 16, False) + assert backend.get_name() != STR_FLASH_ATTN_VAL + + # Attention-free models should bypass env and use PlaceholderAttention + backend = get_attn_backend(16, torch.float16, torch.float16, 16, True) + assert backend.get_name() != STR_FLASH_ATTN_VAL @pytest.mark.parametrize("use_v1", [True, False]) -def test_invalid_env(use_v1: bool, monkeypatch): - """Ignore the invalid env variable if it is set.""" - monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0") - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) +def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch): - with patch("vllm.attention.selector.current_platform", CudaPlatform()): + with monkeypatch.context() as m, patch( + "vllm.attention.selector.current_platform", CudaPlatform()): + m.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + + # Test with head size 32 backend = 
get_attn_backend(32, torch.float16, None, 16, False) EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN" assert backend.get_name() == EXPECTED diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py index 37ce00c74030a..248b294e546b3 100644 --- a/tests/kernels/test_awq.py +++ b/tests/kernels/test_awq.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest import torch @@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401 @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"), reason="AWQ is not supported on this GPU type.") -def test_awq_dequantize_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) - zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) - split_k_iters = 0 - thx = 0 - thy = 0 - opcheck(torch.ops._C.awq_dequantize, - (qweight, scales, zeros, split_k_iters, thx, thy)) +def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16) + zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32) + split_k_iters = 0 + thx = 0 + thy = 0 + opcheck(torch.ops._C.awq_dequantize, + (qweight, scales, zeros, split_k_iters, thx, thy)) @pytest.mark.skip(reason="Not working; needs investigation.") @pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"), reason="AWQ is not supported on this GPU type.") -def test_awq_gemm_opcheck(): - os.environ["VLLM_USE_TRITON_AWQ"] = "0" - input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) - qweight = torch.randint(-2000000000, - 2000000000, (8192, 256), - device='cuda', - dtype=torch.int32) - scales = torch.randint(-2000000000, - 2000000000, (64, 256), - device='cuda', - dtype=torch.int32) - qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) - split_k_iters = 8 - opcheck(torch.ops._C.awq_gemm, - (input, qweight, qzeros, scales, split_k_iters)) +def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_TRITON_AWQ", "0") + input = torch.rand((2, 8192), device='cuda', dtype=torch.float16) + qweight = torch.randint(-2000000000, + 2000000000, (8192, 256), + device='cuda', + dtype=torch.int32) + scales = torch.randint(-2000000000, + 2000000000, (64, 256), + device='cuda', + dtype=torch.int32) + qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16) + split_k_iters = 8 + opcheck(torch.ops._C.awq_gemm, + (input, qweight, qzeros, scales, split_k_iters)) diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py index 7cd6082486605..724f0af283f70 100644 --- a/tests/kernels/test_rocm_attention_selector.py +++ b/tests/kernels/test_rocm_attention_selector.py @@ -1,13 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch - import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.rocm import RocmPlatform +from vllm.utils import STR_BACKEND_ENV_VAR @pytest.fixture(autouse=True) @@ -17,15 +15,19 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -def 
test_selector(monkeypatch): - """Test that the attention selector for ROCm. - """ - override_backend_env_variable(monkeypatch, "ROCM_FLASH") +def test_selector(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH") - with patch("vllm.attention.selector.current_platform", RocmPlatform()): + # Set the current platform to ROCm using monkeypatch + monkeypatch.setattr("vllm.attention.selector.current_platform", + RocmPlatform()) + + # Test standard ROCm attention backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert (backend.get_name() == "ROCM_FLASH" or backend.get_name() == "ROCM_ATTN_VLLM_V1") + # mla test for deepseek related backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, False, True) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 010974076ba8f..22b3d7c2be7a5 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = ( class QKVInputs(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, query/key/values and their sequence lengths. Attributes: - * {query,key,value}: unpacked (batch_size x padded_seq_len x + * {query,key,value}: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs * q_seq_lens: query sequence lengths list * kv_seq_lens: shared key/value sequence lengths list @@ -56,14 +56,14 @@ class QKVInputs(NamedTuple): class QKVO(NamedTuple): ''' - Data structure for representing unpacked attention inputs, + Data structure for representing unpacked attention inputs, alongside unpacked known-correct attention output Attributes: - * qkv: unpacked (batch_size x padded_seq_len x + * qkv: unpacked (batch_size x padded_seq_len x num_heads x head_size) attention inputs - * ideal_output: unpacked (batch_size x padded_seq_len x + * ideal_output: unpacked (batch_size x padded_seq_len x num_heads x head_size) known-correct attention output ''' @@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple): Attributes: - * {query,key,value}: packed (number_of_tokens x num_heads + * {query,key,value}: packed (number_of_tokens x num_heads x head_size) attention inputs * q_start_loc_list: list of query start locations within packed tensor * kv_start_loc_list: shared list of key/value start locations within @@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple): class PackedQKVO(NamedTuple): ''' - Data structure for representing packed attention inputs, + Data structure for representing packed attention inputs, alongside packed known-correct attention output Attributes: - * packed_qkv: packed (number_of_tokens x num_heads + * packed_qkv: packed (number_of_tokens x num_heads x head_size) attention inputs - * ideal_output: packed (number_of_tokens x num_heads + * ideal_output: packed (number_of_tokens x num_heads x head_size) known-correct attention output ''' @@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple): Attributes: - * packed_qkvo: packed (number_of_tokens x num_heads + * packed_qkvo: packed (number_of_tokens x num_heads x head_size) attention inputs & known-correct output * kv_mmap: KV cache memory mapping, specific to this test phase & @@ -195,7 +195,7 @@ def make_causal_mask( Create a q_max_seq_len x kv_max_seq_len causal mask Arguments: - + * q_max_seq_len: query max seq len * kv_max_seq_len: key/value max seq len @@ -320,9 +320,9 @@ def make_qkv( * max_kv_seq_len: max key/value seq len * num_heads 
* head_size - * is_encoder_decoder_attn: if True, query seqlen may differ from - key/value seqlen (as is often the case for cross-attention); - o/w, query/key/value seqlens match at each batch index + * is_encoder_decoder_attn: if True, query seqlen may differ from + key/value seqlen (as is often the case for cross-attention); + o/w, query/key/value seqlens match at each batch index (max_kv_seq_len is unused) * force_kv_seq_lens: if not None, overrides kv sequence lengths * attn_type: encoder, decoder self, or enc/dec cross attention @@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device, Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x num_heads x head_size tensors. - + For Q, number_of_tokens = sum(q_seq_lens). For K and V, number_of_tokens = sum(kv_seq_lens) @@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) - * for backend 'XFORMERS' + * for backend 'XFORMERS' * kv_cache: 2 x num_blocks x block_size x num_heads x head_size - * for backend 'FLASH_ATTN' + * for backend 'FLASH_ATTN' ''' if backend == 'XFORMERS': kv_cache = torch.rand( @@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], Context: * Your goal is to test (1) prefill of N prompts, with prompt-lengths {K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token - for all N prompts (N tokens total); the resultant sequence lengths + for all N prompts (N tokens total); the resultant sequence lengths after decode would be {K_i + 1 for i \\in [0,N)} - * The test you want to do requires (1) having the prefill slot mapping - for all tokens present during prefill, the number of which is - M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N + * The test you want to do requires (1) having the prefill slot mapping + for all tokens present during prefill, the number of which is + M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N decoded tokens - - This function consumes a single 1D slot mapping, which is the + + This function consumes a single 1D slot mapping, which is the concatenation of N slot mappings each of length K_i + 1 (corresponding to the sequence lengths after decode), with a total length of P = \\sum_i{K_i + 1} = M + N The prefill-phase slot mapping results from excising the (K_i + 1)-th entry - from each of the N subsequences in the slot mapping (i.e. omitting the + from each of the N subsequences in the slot mapping (i.e. omitting the decoded token's mapping.) The N excised entries are appended to obtain the decode-phase slot mapping @@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int], * slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N post-decode sequences - * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the + * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the description above) * device: cuda, cpu, etc. 
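    Example (illustrative, not taken from the test suite): with
    seq_lens = [3, 4] and slot_mapping_list = [0, 1, 2, 10, 11, 12, 13],
    the prefill slot mapping is [0, 1, 10, 11, 12] and the decode slot
    mapping is [2, 13].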
Returns: - * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) + * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor) reflecting all N prefill prompts - * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting + * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting all N decoded tokens ''' @@ -725,7 +725,7 @@ def make_block_tables_slot_mapping( Then the minimum KV cache size in blocks is - total_cache_blocks = sum(num_blocks for all seqs) + total_cache_blocks = sum(num_blocks for all seqs) Then, the blocktable mapping counts downward from @@ -734,7 +734,7 @@ def make_block_tables_slot_mapping( to block_base_addr - + The constructed block-tables and slot-mapping are sized to the lengths of the sequences in their entirety (as reflected by seq_lens), @@ -749,7 +749,7 @@ def make_block_tables_slot_mapping( Return: - * block_tables_tensor: block table for sequence + * block_tables_tensor: block table for sequence * slot_mapping_list: slot mapping for sequence * max_block_idx: the highest block address within this block table ''' @@ -807,7 +807,7 @@ def make_test_metadata( encoder_test_params and cross_test_params arguments allow encoder attention and enc/dec cross-attention (respectively) to use distinct metadata values from decoder self-attention (decoder_test_params.) - + if encoder_test_params and cross_test_params are None, the attention metadata will support decoder-only scenario. @@ -820,7 +820,7 @@ def make_test_metadata( * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence - * decoder_test_params: decoder self-attention test params; + * decoder_test_params: decoder self-attention test params; this function requires kv_mmap (memory mapping) field * device: CPU or CUDA device diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/test_disagg.py similarity index 100% rename from tests/kv_transfer/disagg_test.py rename to tests/kv_transfer/test_disagg.py diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/test_module.py similarity index 100% rename from tests/kv_transfer/module_test.py rename to tests/kv_transfer/test_module.py diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index faca7a566e79c..51abcb7172cb7 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -12,11 +12,10 @@ import pytest from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported from vllm.platforms import current_platform +from vllm.utils import STR_BACKEND_ENV_VAR from ...utils import check_logprobs_close -os.environ["TOKENIZERS_PARALLELISM"] = "true" - @pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), @@ -55,45 +54,47 @@ def test_models( backend: str, tensor_parallel_size: int, disable_async_output_proc: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') + m.setenv(STR_BACKEND_ENV_VAR, backend) - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="fp16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) @pytest.mark.cpu_model @@ -119,38 +120,41 @@ def test_cpu_models( test_model: str, max_tokens: int, disable_async_output_proc: bool, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Only checks log probs match to cover the discrepancy in numerical sensitive kernels. 
""" + with monkeypatch.context() as m: + m.setenv("TOKENIZERS_PARALLELISM", 'true') - MAX_MODEL_LEN = 1024 - NUM_LOG_PROBS = 8 + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 - with vllm_runner( - base_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype="auto", - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - baseline_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - with vllm_runner( - test_model, - max_model_len=MAX_MODEL_LEN, - dtype="bfloat16", - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - test_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + dtype="bfloat16", + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=test_outputs, - name_0="bf16_kv_cache", - name_1="fp8_kv_cache", - ) + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="bf16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index cae3e1a5c6244..d6bf7d2706397 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import importlib.util import math @@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine import vllm import vllm.config +from vllm.utils import STR_BACKEND_ENV_VAR from ....utils import RemoteOpenAIServer @@ -29,36 +31,34 @@ def _arr(arr): return array("i", arr) -def test_find_array(monkeypatch): +def test_find_array(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - from vllm.model_executor.models.gritlm import GritLMPooler + from vllm.model_executor.models.gritlm import GritLMPooler - # Create an LLM object to get the model config. - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - pooler = GritLMPooler(model_config=llm.llm_engine.model_config) + # Create an LLM object to get the model config. 
+ llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + pooler = GritLMPooler(model_config=llm.llm_engine.model_config) - arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 - assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 - assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 + assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 + assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 - with pytest.raises(ValueError): - pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) + with pytest.raises(ValueError): + pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) @pytest.fixture(scope="module") def server_embedding(): # GritLM embedding implementation is only supported by XFormers backend. - with pytest.MonkeyPatch.context() as mp: - mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") - - args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: - yield remote_server + args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server @pytest.fixture(scope="module") @@ -69,9 +69,12 @@ def server_generate(): @pytest_asyncio.fixture -async def client_embedding(server_embedding: RemoteOpenAIServer): - async with server_embedding.get_async_client() as async_client: - yield async_client +async def client_embedding(monkeypatch: pytest.MonkeyPatch, + server_embedding: RemoteOpenAIServer): + with monkeypatch.context() as m: + m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + async with server_embedding.get_async_client() as async_client: + yield async_client @pytest_asyncio.fixture @@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer): yield async_client -def run_llm_encode(llm: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +def run_llm_encode( + llm: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = llm.encode([instruction + q for q in queries], ) return [output.outputs.embedding for output in outputs] -async def run_client_embeddings(client: vllm.LLM, queries: list[str], - instruction: str) -> list[float]: +async def run_client_embeddings( + client: vllm.LLM, + queries: list[str], + instruction: str, +) -> list[float]: outputs = await client.embeddings.create( model=MODEL_NAME, input=[instruction + q for q in queries], @@ -106,7 +115,7 @@ def get_test_data(): README.md in https://github.com/ContextualAI/gritlm """ q_instruction = gritlm_instruction( - "Given a scientific paper title, retrieve the paper's abstract") + "Given a scientific paper title, retrieve the paper's abstract", ) queries = [ "Bitcoin: A Peer-to-Peer Electronic Cash System", "Generative Representational Instruction Tuning", @@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) -def test_gritlm_offline_embedding(monkeypatch): +def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): # GritLM embedding implementation is only supported by XFormers backend. 
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") - queries, q_instruction, documents, d_instruction = get_test_data() + queries, q_instruction, documents, d_instruction = get_test_data() - llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) + llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) - d_rep = run_llm_encode( - llm, - documents, - d_instruction, - ) - q_rep = run_llm_encode( - llm, - queries, - q_instruction, - ) + d_rep = run_llm_encode( + llm, + documents, + d_instruction, + ) + q_rep = run_llm_encode( + llm, + queries, + q_instruction, + ) - validate_embed_output(q_rep, d_rep) + validate_embed_output(q_rep, d_rep) @pytest.mark.asyncio async def test_gritlm_api_server_embedding( - client_embedding: openai.AsyncOpenAI): + client_embedding: openai.AsyncOpenAI, ): queries, q_instruction, documents, d_instruction = get_test_data() d_rep = await run_client_embeddings( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index d3d07d0d9acfc..465c496f4c0f3 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import os - import pytest from vllm import LLM, SamplingParams @@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test @fork_new_process_for_each_test -def test_plugin(dummy_opt_path, monkeypatch): +def test_plugin( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): # V1 shuts down rather than raising an error here. - monkeypatch.setenv("VLLM_USE_V1", "0") - os.environ["VLLM_PLUGINS"] = "" - with pytest.raises(Exception) as excinfo: - LLM(model=dummy_opt_path, load_format="dummy") - error_msg = "has no vLLM implementation and " \ - "the Transformers implementation is not compatible with vLLM" - assert (error_msg in str(excinfo.value)) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + m.setenv("VLLM_PLUGINS", "") + + with pytest.raises(Exception) as excinfo: + LLM(model=dummy_opt_path, load_format="dummy") + error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501 + assert (error_msg in str(excinfo.value)) @fork_new_process_for_each_test -def test_oot_registration_text_generation(dummy_opt_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_opt_path, load_format="dummy") - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) +def test_oot_registration_text_generation( + monkeypatch: pytest.MonkeyPatch, + dummy_opt_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_opt_path, load_format="dummy") + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" 
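# Illustrative sketch (not part of this patch): the core pattern these tests
# now follow. Setting an environment variable through a monkeypatch context
# restores the previous value automatically when the block exits, even if the
# test body raises. "SOME_ENV_VAR" is a hypothetical name.
import os

import pytest


def test_env_var_is_restored(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.delenv("SOME_ENV_VAR", raising=False)
    with monkeypatch.context() as m:
        m.setenv("SOME_ENV_VAR", "1")
        assert os.environ["SOME_ENV_VAR"] == "1"
    # Exiting the context undoes everything set on `m`.
    assert "SOME_ENV_VAR" not in os.environ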
@fork_new_process_for_each_test -def test_oot_registration_embedding(dummy_gemma2_embedding_path): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] - llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") - outputs = llm.embed(prompts) +def test_oot_registration_embedding( + monkeypatch: pytest.MonkeyPatch, + dummy_gemma2_embedding_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = ["Hello, my name is", "The text does not matter"] + llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy") + outputs = llm.embed(prompts) - for output in outputs: - assert all(v == 0 for v in output.outputs.embedding) + for output in outputs: + assert all(v == 0 for v in output.outputs.embedding) image = ImageAsset("cherry_blossom").pil_image.convert("RGB") @fork_new_process_for_each_test -def test_oot_registration_multimodal(dummy_llava_path, monkeypatch): - os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = [{ - "prompt": "What's in the image?", - "multi_modal_data": { - "image": image - }, - }, { - "prompt": "Describe the image", - "multi_modal_data": { - "image": image - }, - }] +def test_oot_registration_multimodal( + monkeypatch: pytest.MonkeyPatch, + dummy_llava_path: str, +): + with monkeypatch.context() as m: + m.setenv("VLLM_PLUGINS", "register_dummy_model") + prompts = [{ + "prompt": "What's in the image?", + "multi_modal_data": { + "image": image + }, + }, { + "prompt": "Describe the image", + "multi_modal_data": { + "image": image + }, + }] - sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_llava_path, - load_format="dummy", - max_num_seqs=1, - trust_remote_code=True, - gpu_memory_utilization=0.98, - max_model_len=4096, - enforce_eager=True, - limit_mm_per_prompt={"image": 1}) - first_token = llm.get_tokenizer().decode(0) - outputs = llm.generate(prompts, sampling_params) + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_llava_path, + load_format="dummy", + max_num_seqs=1, + trust_remote_code=True, + gpu_memory_utilization=0.98, + max_model_len=4096, + enforce_eager=True, + limit_mm_per_prompt={"image": 1}) + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) - for output in outputs: - generated_text = output.outputs[0].text - # make sure only the first token is generated - rest = generated_text.replace(first_token, "") - assert rest == "" + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index aad7fc5303c13..e617bd057f1f4 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket): @pytest.mark.asyncio -async def test_mp_crash_detection(monkeypatch): +async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) + parser = FlexibleArgumentParser( + description="vLLM's remote OpenAI server.") + parser = make_arg_parser(parser) + args = parser.parse_args([]) - # When LLMEngine is loaded, it will crash. 
- def mock_init(): - raise ValueError + # When LLMEngine is loaded, it will crash. + def mock_init(): + raise ValueError - monkeypatch.setattr(LLMEngine, "__init__", mock_init) + m.setattr(LLMEngine, "__init__", mock_init) - start = time.perf_counter() - async with build_async_engine_client(args): - pass - end = time.perf_counter() + start = time.perf_counter() + async with build_async_engine_client(args): + pass + end = time.perf_counter() - assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s " - "if there is an error in the startup.") + assert end - start < 60, ( + "Expected vLLM to gracefully shutdown in <60s " + "if there is an error in the startup.") @pytest.mark.asyncio diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index f925e42f46d37..ce716e6474cb4 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -5,7 +5,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close from ..utils import (completions_with_server_args, get_client_text_generations, @@ -52,7 +52,7 @@ async def test_multi_step( num_logprobs: Optional[int], attention_backend: str, enable_chunked_prefill: bool, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling in an OpenAI-protocol client/server environment. @@ -82,67 +82,70 @@ async def test_multi_step( pytest.skip("Multi-step with Chunked-Prefill only supports" "PP=1 and FLASH_ATTN backend") - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] - if not is_async: - ms_server_args += ["--disable-async-output-proc"] + if not is_async: + ms_server_args += ["--disable-async-output-proc"] - if eager_mode: - ms_server_args.append("--enforce-eager") + if eager_mode: + ms_server_args.append("--enforce-eager") - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") + if enable_chunked_prefill: + ms_server_args.append("--enable-chunked-prefill") - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] + distributed_args = [ + "--tensor-parallel-size", + str(tp_size), + "--pipeline-parallel-size", + str(pp_size), + ] - # Spin up client/server & issue completion API requests. 
- # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 5x to 1200 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts, + model, + server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) + test_completions = await completions_with_server_args( + prompts, + model, + ms_server_args + distributed_args, + num_logprobs, + max_wait_seconds=5 * 240) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) + assert ref_generations == test_generations - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations(ref_completions) - test_text_logprobs = get_client_text_logprob_generations(test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) + # Assert multi-step scheduling produces nearly-identical logprobs + # to single-step scheduling. + ref_text_logprobs = get_client_text_logprob_generations( + ref_completions) + test_text_logprobs = get_client_text_logprob_generations( + test_completions) + check_logprobs_close( + outputs_0_lst=ref_text_logprobs, + outputs_1_lst=test_text_logprobs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize(("tp_size, pp_size"), [ @@ -152,7 +155,7 @@ async def test_multi_step( async def test_multi_step_pp_smoke( tp_size: int, pp_size: int, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """ Smoke test for the vLLM engine with multi-step scheduling in an @@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke( attention_backend = "FLASH_ATTN" max_num_seqs = 3 - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. 
- max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) + # Prompt from the ShareGPT dataset + prompts = [ + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + ] + # Use varying max_tokens to introduce scheduling randomness. + max_tokens = [10 * i for i in range(1, len(prompts) + 1)] + assert len(prompts) == len(max_tokens) - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] + test_args = [ + "--tensor-parallel-size", + str(tp_size), "--pipeline-parallel-size", + str(pp_size), "--max-num-seqs", + str(max_num_seqs) + ] - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args + server_args = DEFAULT_SERVER_ARGS + test_args + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ + test_args - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 3x to 720 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) + test_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=ms_server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. 
+ ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations + assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 29d5ffd4c9cb1..a823e484beab6 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -7,7 +7,7 @@ from typing import Optional import pytest -from tests.kernels.utils import override_backend_env_variable +from vllm.utils import STR_BACKEND_ENV_VAR from ..models.utils import check_logprobs_close, check_outputs_equal @@ -42,7 +42,7 @@ def test_multi_step_llm( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step scheduling via sync LLM Engine. @@ -70,48 +70,49 @@ def test_multi_step_llm( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> 1 logprob returned. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=enable_chunked_prefill, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + vllm_model.generate_greedy_logprobs( + prompts, max_tokens, num_logprobs)) - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) + if num_logprobs is None else + hf_model.generate_greedy_logprobs_limit( + prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if num_logprobs is None: + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -136,7 +137,7 @@ def 
test_multi_step_llm_w_prompt_logprobs( num_logprobs: Optional[int], num_prompt_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test prompt logprobs with multi-step scheduling via sync LLM Engine. @@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs( note that this argument is not supported by the OpenAI completions endpoint. """ - override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts + prompts = example_prompts + if len(prompts) < num_prompts: + prompts = prompts * ((num_prompts // len(prompts)) + 1) + prompts = prompts[:num_prompts] + assert len(prompts) == num_prompts - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + ) as vllm_model: + single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs, + num_prompt_logprobs=num_prompt_logprobs) - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_logprobs_close( + outputs_0_lst=single_step_vllm_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) @@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( num_prompts: int, num_logprobs: Optional[int], attention_backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. @@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache( # # The Incorrect scheduling behavior - if it occurs - will cause an exception # in the model runner resulting from `do_sample=False`. 
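# Hedged sketch (not part of this patch) of how the repeated backend override
# could be factored into a shared helper; `backend_override` is a hypothetical
# name, while STR_BACKEND_ENV_VAR is the same constant imported from
# vllm.utils in the surrounding hunks.
# Usage: `with backend_override(monkeypatch, "FLASH_ATTN"): ...`
from contextlib import contextmanager

import pytest

from vllm.utils import STR_BACKEND_ENV_VAR


@contextmanager
def backend_override(monkeypatch: pytest.MonkeyPatch, backend: str):
    # Scope the attention-backend env var to the enclosed block only.
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, backend)
        yield m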
- override_backend_env_variable(monkeypatch, attention_backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n' - ) # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok + assert len(example_prompts) >= 2 + challenge_prompts = copy.deepcopy(example_prompts) + challenge_prompts[0] = ( + 'vLLM is a high-throughput and memory-efficient ' + 'inference and serving engine for LLMs.\n') # 24 tok + challenge_prompts[1] = ( + 'Briefly describe the major milestones in the ' + 'development of artificial intelligence from 1950 to 2020.\n' + ) # 30 tok - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts + # If necessary, adjust the length of `challenge_prompts` to match + # `num_prompts` + if len(challenge_prompts) < num_prompts: + challenge_prompts = (challenge_prompts * + ((num_prompts // len(challenge_prompts)) + 1)) + challenge_prompts = challenge_prompts[:num_prompts] + assert len(challenge_prompts) == num_prompts - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # Single-step scheduler baseline + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as vllm_model: + outputs_baseline = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = (vllm_model.generate_greedy( - challenge_prompts, max_tokens) if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) + # multi-step+"single-step chunked prefill"+APC + with vllm_runner( + model, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7, + tensor_parallel_size=tp_size, + enable_chunked_prefill=True, + enable_prefix_caching=True, + num_scheduler_steps=num_scheduler_steps, + max_model_len=48, + max_num_batched_tokens=48, + max_num_seqs=4, + block_size=16, + ) as 
vllm_model: + outputs_w_features = ( + vllm_model.generate_greedy(challenge_prompts, max_tokens) if + num_logprobs is None else vllm_model.generate_greedy_logprobs( + challenge_prompts, max_tokens, num_logprobs)) - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) + if num_logprobs is None: + # No-logprobs test + check_outputs_equal( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) + else: + # Yes-logprobs test + check_logprobs_close( + outputs_0_lst=outputs_baseline, + outputs_1_lst=outputs_w_features, + name_0="multi-step", + name_1="multi-step+features", + ) diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py index 30dcdd573edf3..033a36b4156b0 100644 --- a/tests/neuron/1_core/test_block_table.py +++ b/tests/neuron/1_core/test_block_table.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -import os import neuronxcc.nki.language as nl import pytest @@ -99,6 +98,7 @@ def ref_block_tables_transform( ) @torch.inference_mode() def test_load_and_transform_block_tables( + monkeypatch: pytest.MonkeyPatch, num_tiles, num_blocks_per_tile, q_head_per_kv_head, @@ -108,46 +108,46 @@ def test_load_and_transform_block_tables( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(10000) - torch.set_printoptions(sci_mode=False) + torch.manual_seed(10000) + torch.set_printoptions(sci_mode=False) - # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient - B_P_SIZE = 128 - if num_blocks_per_tile < B_P_SIZE: - assert B_P_SIZE % num_blocks_per_tile == 0 - block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile - else: - block_size_tiling_factor = 1 - max_num_blocks = 100000 - block_tables = torch.randint( - 0, - max_num_blocks, - (num_tiles * num_blocks_per_tile, ), - dtype=torch.int32, - ) - nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( - block_tables.to(device=device), - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ).cpu() - ref_out = ref_block_tables_transform( - block_tables, - num_tiles, - num_blocks_per_tile, - q_head_per_kv_head, - head_id, - block_size_tiling_factor, - ) - assert (nki_out.shape == ref_out.shape - ), f"{nki_out.shape=} != {ref_out.shape=}" - assert torch.all(nki_out == ref_out) + # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient + B_P_SIZE = 128 + if num_blocks_per_tile < B_P_SIZE: + assert B_P_SIZE % num_blocks_per_tile == 0 + block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile + else: + block_size_tiling_factor = 1 + max_num_blocks = 100000 + block_tables = torch.randint( + 0, + max_num_blocks, + (num_tiles * num_blocks_per_tile, ), + dtype=torch.int32, + ) + nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1]( + block_tables.to(device=device), + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ).cpu() 
+ ref_out = ref_block_tables_transform( + block_tables, + num_tiles, + num_blocks_per_tile, + q_head_per_kv_head, + head_id, + block_size_tiling_factor, + ) + assert (nki_out.shape == ref_out.shape + ), f"{nki_out.shape=} != {ref_out.shape=}" + assert torch.all(nki_out == ref_out) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 326a1f82e9b30..37d6679f8d55b 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, ]) @torch.inference_mode() def test_contexted_kv_attention( + monkeypatch: pytest.MonkeyPatch, prefill_batch_size: int, decode_batch_size: int, num_heads: int, @@ -329,7 +330,6 @@ def test_contexted_kv_attention( large_tile_size, mixed_precision: bool, ) -> None: - import os import torch_xla.core.xla_model as xm @@ -340,174 +340,178 @@ def test_contexted_kv_attention( device = xm.xla_device() - compiler_flags = [ + compiler_flags_str = " ".join([ "-O1", "--retry_failed_compilation", - ] - compiler_flags_str = " ".join(compiler_flags) - os.environ["NEURON_CC_FLAGS"] = compiler_flags_str + ]) + with monkeypatch.context() as m: + m.setenv("NEURON_CC_FLAGS", compiler_flags_str) - torch.manual_seed(0) - torch.set_printoptions(sci_mode=False) - torch.set_default_device("cpu") - dtype = torch.float32 + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + torch.set_default_device("cpu") + dtype = torch.float32 - min_ctx_len = 32 - max_ctx_len = 1024 - min_query_len = 16 - max_query_len = 512 - num_kv_heads = num_heads // num_queries_per_kv - ( - query, - k_active, - v_active, - k_cache, - v_cache, - block_table, - key, - value, - query_lens, - seq_lens, - ) = sample_inputs( - prefill_batch_size=prefill_batch_size, - decode_batch_size=decode_batch_size, - min_query_len=min_query_len, - max_query_len=max_query_len, - min_ctx_len=min_ctx_len, - max_ctx_len=max_ctx_len, - block_size=block_size, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_size=head_size, - dtype=dtype, - ) + min_ctx_len = 32 + max_ctx_len = 1024 + min_query_len = 16 + max_query_len = 512 + num_kv_heads = num_heads // num_queries_per_kv + ( + query, + k_active, + v_active, + k_cache, + v_cache, + block_table, + key, + value, + query_lens, + seq_lens, + ) = sample_inputs( + prefill_batch_size=prefill_batch_size, + decode_batch_size=decode_batch_size, + min_query_len=min_query_len, + max_query_len=max_query_len, + min_ctx_len=min_ctx_len, + max_ctx_len=max_ctx_len, + block_size=block_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + ) - output_ref = ref_context_attention( - query, - key, - value, - query_lens, - seq_lens, - head_size, - num_queries_per_kv, - return_max_reduce=False, - ) + output_ref = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_queries_per_kv, + return_max_reduce=False, + ) - # build neuron program - B_P_SIZE = 128 - assert (large_tile_size >= B_P_SIZE - ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" + # build neuron program + B_P_SIZE = 128 + assert (large_tile_size >= B_P_SIZE + ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}" - def ceil_div(a, b): - return (a + b - 1) // b + def ceil_div(a, b): + return (a + b - 1) // b - def pad_to_multiple(a, b): - return ceil_div(a, b) * b + def pad_to_multiple(a, b): + return ceil_div(a, b) * b - def pad_to_next_power_of_2(a): 
- assert a > 0 - return 2**int(a - 1).bit_length() + def pad_to_next_power_of_2(a): + assert a > 0 + return 2**int(a - 1).bit_length() - # calculate input shapes - max_num_queries = pad_to_next_power_of_2(sum(query_lens)) - context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks = ceil_div(context_lens, block_size).sum().item() - num_active_blocks = pad_to_multiple(num_active_blocks, - large_tile_size // block_size) - context_kv_len = num_active_blocks * block_size - assert (context_kv_len % + # calculate input shapes + max_num_queries = pad_to_next_power_of_2(sum(query_lens)) + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = pad_to_multiple(num_active_blocks, + large_tile_size // block_size) + context_kv_len = num_active_blocks * block_size + assert ( + context_kv_len % large_tile_size == 0), f"invalid context_kv_len={context_kv_len}" - # pad QKV tensors - pad_dims = ( - 0, - 0, - 0, - 0, - 0, - max_num_queries - query.shape[0], - ) - query = F.pad(query, pad_dims, "constant", 0) - k = F.pad(k_active, pad_dims, "constant", 0) - v = F.pad(v_active, pad_dims, "constant", 0) - - # permute QKV tensors - # query: (1, n_heads, d, seq_q) - # key: (1, n_kv_heads, d, seq_k) - # value: (1, n_kv_heads, seq_v, d) - query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() - v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() - k_cache = k_cache.permute(0, 2, 1, 3).contiguous() - v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - - # transform block table - active_block_table = get_active_block_tables( - block_table.cpu(), - torch.tensor(query_lens).cpu(), - torch.tensor(seq_lens).cpu(), - block_size, - num_active_blocks, - ) - - # Build attention masks - prior_mask, active_mask = ( - BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens, block_size=block_size)) - prior_mask_padded = F.pad( - prior_mask, - ( + # pad QKV tensors + pad_dims = ( 0, - context_kv_len - prior_mask.shape[1], 0, - max_num_queries - prior_mask.shape[0], - ), - "constant", - 0, - ).bool() - active_mask_padded = F.pad( - active_mask, - ( 0, - max_num_queries - active_mask.shape[1], 0, - max_num_queries - active_mask.shape[0], - ), - "constant", - 0, - ).bool() - attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1) + 0, + max_num_queries - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k_active, pad_dims, "constant", 0) + v = F.pad(v_active, pad_dims, "constant", 0) - attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size) + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + k_cache = k_cache.permute(0, 2, 1, 3).contiguous() + v_cache = v_cache.permute(0, 2, 1, 3).contiguous() - input_args = ( - query.to(device=device), - k.to(device=device), - v.to(device=device), - k_cache.to(device=device), - v_cache.to(device=device), - active_block_table.to(device=device), - attn_mask.to(device=device), - ) - input_kwargs = dict( - n_kv_head=num_kv_heads, - head_size=head_size, - mixed_precision=mixed_precision, - LARGE_TILE_SZ=large_tile_size, - ) + # transform block table + active_block_table = 
get_active_block_tables( + block_table.cpu(), + torch.tensor(query_lens).cpu(), + torch.tensor(seq_lens).cpu(), + block_size, + num_active_blocks, + ) - output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + prior_mask_padded = F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + max_num_queries - prior_mask.shape[0], + ), + "constant", + 0, + ).bool() + active_mask_padded = F.pad( + active_mask, + ( + 0, + max_num_queries - active_mask.shape[1], + 0, + max_num_queries - active_mask.shape[0], + ), + "constant", + 0, + ).bool() + attn_mask = torch.concat([prior_mask_padded, active_mask_padded], + dim=1) - num_actual_tokens = sum(query_lens) - # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.cpu().permute(0, 2, 1, 3) - output_nki = output_nki[0, :num_actual_tokens, :, :] - output_ref_padded = F.pad( - output_ref, - (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), - "constant", - 0, - ) - output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + attn_mask = reorder_context_mask(attn_mask, large_tile_size, + block_size) - torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + LARGE_TILE_SZ=large_tile_size, + ) + + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + + num_actual_tokens = sum(query_lens) + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.cpu().permute(0, 2, 1, 3) + output_nki = output_nki[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose( + 0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 3be248f5aca45..9d6872e0e0772 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import torch -from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import get_attn_backend -from vllm.utils import STR_INVALID_VAL +from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL def test_platform_plugins(): @@ -25,8 +25,9 @@ def test_platform_plugins(): f" is loaded. 
The first import:\n{_init_trace}") -def test_oot_attention_backend(monkeypatch): +def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch): # ignore the backend env variable if it is set - override_backend_env_variable(monkeypatch, STR_INVALID_VAL) - backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - assert backend.get_name() == "Dummy_Backend" + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL) + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "Dummy_Backend" diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 98981a81e909c..7abf5066a4133 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: +def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV0Scheduler, + ) - engine = LLMEngine.from_engine_args(engine_args=engine_args) + engine = LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV0Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV0Scheduler" -def test_scheduler_plugins_v1(monkeypatch): - monkeypatch.setenv("VLLM_USE_V1", "1") - # Explicitly turn off engine multiprocessing so that the scheduler runs in - # this process - monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") +def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + # Explicitly turn off engine multiprocessing so + # that the scheduler runs in this process + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") - with pytest.raises(Exception) as exception_info: + with pytest.raises(Exception) as exception_info: - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV1Scheduler, - ) + engine_args = EngineArgs( + model="facebook/opt-125m", + enforce_eager=True, # reduce test time + scheduler_cls=DummyV1Scheduler, + ) - engine = V1LLMEngine.from_engine_args(engine_args=engine_args) + engine = V1LLMEngine.from_engine_args(engine_args=engine_args) - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() + sampling_params = SamplingParams(max_tokens=1) + engine.add_request("0", "foo", sampling_params) + engine.step() - assert str(exception_info.value) == "Exception raised by DummyV1Scheduler" + assert str( + exception_info.value) == "Exception raised by DummyV1Scheduler" diff --git a/tests/prefix_caching/test_prefix_caching.py 
b/tests/prefix_caching/test_prefix_caching.py
index 7a4bc7aecc0f4..607b6c43e02e2 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -4,25 +4,29 @@
 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
+from __future__ import annotations
+
 import pytest
 
 from tests.conftest import VllmRunner
 from tests.core.utils import SchedulerProxy, create_dummy_prompt
-from tests.kernels.utils import override_backend_env_variable
 from vllm import SamplingParams, TokensPrompt
 from vllm.core.scheduler import Scheduler
 from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
 
 from ..models.utils import check_outputs_equal
 
 
 @pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
     """
     This module relies on V0 internals, so set VLLM_USE_V1=0.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
 
 
 MODELS = [
@@ -56,7 +60,7 @@ def test_mixed_requests(
     cached_position: int,
     enable_chunked_prefill: bool,
     block_size: int,
-    monkeypatch,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     """
     Test the case when some sequences have the prefix cache hit
@@ -67,72 +71,77 @@ def test_mixed_requests(
         pytest.skip("Flashinfer does not support ROCm/HIP.")
     if backend == "XFORMERS" and current_platform.is_rocm():
         pytest.skip("Xformers does not support ROCm/HIP.")
-    override_backend_env_variable(monkeypatch, backend)
+    with monkeypatch.context() as m:
+        m.setenv(STR_BACKEND_ENV_VAR, backend)
 
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    cached_prompt = example_prompts[cached_position]
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enable_prefix_caching=True,
-            enable_chunked_prefill=enable_chunked_prefill,
-            block_size=block_size,
-    ) as vllm_model:
-        # Run the first prompt so the cache is populated
-        vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
+        cached_prompt = example_prompts[cached_position]
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                enable_prefix_caching=True,
+                enable_chunked_prefill=enable_chunked_prefill,
+                block_size=block_size,
+        ) as vllm_model:
+            # Run the first prompt so the cache is populated
+            vllm_outputs = vllm_model.generate_greedy([cached_prompt],
+                                                      max_tokens)
 
-        # Run all the promopts
-        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
-        req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
+            # Run all the prompts
+            greedy_params = SamplingParams(temperature=0.0,
+                                           max_tokens=max_tokens)
+            req_outputs = vllm_model.model.generate(example_prompts,
+                                                    greedy_params)
 
-        # Verify number of cached tokens
-        for i in range(len(req_outputs)):
-            if i == cached_position:
-                expected_num_cached_tokens = (
-                    len(req_outputs[i].prompt_token_ids) //
-                    block_size) * block_size
-            else:
-                expected_num_cached_tokens = 0
-            assert (
-                req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
+            # Verify number of cached tokens
+            for i in range(len(req_outputs)):
+                if i == cached_position:
+                    expected_num_cached_tokens = (
+                        len(req_outputs[i].prompt_token_ids) //
+                        block_size) * block_size
+                else:
+                    expected_num_cached_tokens = 0
+                assert (req_outputs[i].num_cached_tokens ==
+                        
expected_num_cached_tokens) - vllm_outputs = [( - output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text, - ) for output in req_outputs] + vllm_outputs = [( + output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text, + ) for output in req_outputs] - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) def test_unstable_prompt_sequence( vllm_runner, backend: str, - monkeypatch, + monkeypatch: pytest.MonkeyPatch, ) -> None: if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") if backend == "XFORMERS" and current_platform.is_rocm(): pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) + with monkeypatch.context() as m: + m.setenv(STR_BACKEND_ENV_VAR, backend) - with vllm_runner( - "Qwen/Qwen2.5-0.5B-Instruct", - enable_chunked_prefill=True, - enable_prefix_caching=True, - max_model_len=4096, - ) as vllm_model: - for prompt in UNSTABLE_PROMPT_SEQUENCE: - vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), - SamplingParams(max_tokens=1)) + with vllm_runner( + "Qwen/Qwen2.5-0.5B-Instruct", + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_model_len=4096, + ) as vllm_model: + for prompt in UNSTABLE_PROMPT_SEQUENCE: + vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), + SamplingParams(max_tokens=1)) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/test_regression.py b/tests/test_regression.py index b54dc6af3e9a6..8c9d4a91c73be 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -56,12 +56,11 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 -def test_model_from_modelscope(monkeypatch): +def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary - MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat" - monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True") - try: - llm = LLM(model=MODELSCOPE_MODEL_NAME) + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") prompts = [ "Hello, my name is", @@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch): outputs = llm.generate(prompts, sampling_params) assert len(outputs) == 4 - finally: - monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/test_utils.py b/tests/test_utils.py index dcca7d5965e9e..ae4fddd046d45 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa import asyncio -import os import socket from collections.abc import AsyncIterator from unittest.mock import patch @@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message(): dummy(old_arg=1) -def test_get_open_port(): - os.environ["VLLM_PORT"] = "5678" - # make sure we can get multiple ports, even if the env var is set - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: - s1.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: - s2.bind(("localhost", get_open_port())) - with socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) as s3: - s3.bind(("localhost", get_open_port())) - os.environ.pop("VLLM_PORT") +def test_get_open_port(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_PORT", "5678") + # make sure we can get multiple ports, even if the env var is set + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1: + s1.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2: + s2.bind(("localhost", get_open_port())) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3: + s3.bind(("localhost", get_open_port())) # Tests for FlexibleArgumentParser @@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention(): assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1] -def test_bind_kv_cache_encoder_decoder(monkeypatch): +def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch): # V1 TESTS: ENCODER_DECODER is not supported on V1 yet. - monkeypatch.setenv("VLLM_USE_V1", "0") + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") - from vllm.attention import Attention, AttentionType + from vllm.attention import Attention, AttentionType - # example from bart - ctx = { - 'encoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), - 'decoder.layers.0.encoder_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), - 'decoder.layers.0.self_attn.attn': - Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), - } + # example from bart + ctx = { + 'encoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER), + 'decoder.layers.0.encoder_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER), + 'decoder.layers.0.self_attn.attn': + Attention(32, 128, 0.1, attn_type=AttentionType.DECODER), + } - kv_cache = [ - torch.zeros((1, )), - ] - encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache + kv_cache = [ + torch.zeros((1, )), + ] + encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache - bind_kv_cache(ctx, [kv_cache]) - assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache - assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] - assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] + bind_kv_cache(ctx, [kv_cache]) + assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache + assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0] + assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0] def test_bind_kv_cache_pp(): diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index e94bbd2877225..f7a59f054b61b 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -import os +import pytest from vllm.config import CompilationLevel @@ -9,16 +9,17 @@ from ..utils import compare_two_settings # --enforce-eager on TPU causes graph compilation # this times out default Health Check in the MQLLMEngine, # so we set the timeout here to 30s -os.environ["VLLM_RPC_TIMEOUT"] = "30000" -def test_custom_dispatcher(): - compare_two_settings( - "google/gemma-2b", - arg1=[ - "--enforce-eager", - f"-O{CompilationLevel.DYNAMO_ONCE}", - ], - arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], - env1={}, - env2={}) +def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + 
m.setenv("VLLM_RPC_TIMEOUT", "30000") + compare_two_settings( + "google/gemma-2b", + arg1=[ + "--enforce-eager", + f"-O{CompilationLevel.DYNAMO_ONCE}", + ], + arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], + env1={}, + env2={}) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 97149884497af..a781b8b563be1 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 +# ruff: noqa +# type: ignore +from __future__ import annotations -import os import threading from collections.abc import Iterable from concurrent import futures -from typing import Callable, Literal +from typing import Callable, Generator, Literal import grpc import pytest @@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): +def use_v0_only(monkeypatch: pytest.MonkeyPatch): """ Since this module is V0 only, set VLLM_USE_V1=0 for all tests in the module. """ - monkeypatch.setenv('VLLM_USE_V1', '0') + with monkeypatch.context() as m: + m.setenv('VLLM_USE_V1', '0') + yield FAKE_TRACE_SERVER_ADDRESS = "localhost:4317" @@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer): @pytest.fixture -def trace_service(): +def trace_service() -> Generator[FakeTraceService, None, None]: """Fixture to set up a fake gRPC trace service""" server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) service = FakeTraceService() @@ -80,136 +84,153 @@ def trace_service(): server.stop(None) -def test_traces(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got 
{len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - # Model forward and model execute should be none, since detailed traces is - # not enabled. - assert metrics.model_forward_time is None - assert metrics.model_execute_time is None + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + # Model forward and model execute should be none, since detailed traces is + # not enabled. 
+ assert metrics.model_forward_time is None + assert metrics.model_execute_time is None -def test_traces_with_detailed_steps(trace_service): - os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true" +def test_traces_with_detailed_steps( + monkeypatch: pytest.MonkeyPatch, + trace_service: FakeTraceService, +): + with monkeypatch.context() as m: + m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true") - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - model = "facebook/opt-125m" - llm = LLM( - model=model, - otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces="all", - ) - prompts = ["This is a short prompt"] - outputs = llm.generate(prompts, sampling_params=sampling_params) + sampling_params = SamplingParams( + temperature=0.01, + top_p=0.1, + max_tokens=256, + ) + model = "facebook/opt-125m" + llm = LLM( + model=model, + otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, + collect_detailed_traces="all", + ) + prompts = ["This is a short prompt"] + outputs = llm.generate(prompts, sampling_params=sampling_params) - timeout = 5 - if not trace_service.evt.wait(timeout): - raise TimeoutError( - f"The fake trace service didn't receive a trace within " - f"the {timeout} seconds timeout") + timeout = 5 + if not trace_service.evt.wait(timeout): + raise TimeoutError( + f"The fake trace service didn't receive a trace within " + f"the {timeout} seconds timeout") - request = trace_service.request - assert len(request.resource_spans) == 1, ( - f"Expected 1 resource span, " - f"but got {len(request.resource_spans)}") - assert len(request.resource_spans[0].scope_spans) == 1, ( - f"Expected 1 scope span, " - f"but got {len(request.resource_spans[0].scope_spans)}") - assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( - f"Expected 1 span, " - f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") - attributes = decode_attributes( - request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE - ) == sampling_params.temperature - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( - outputs[0].prompt_token_ids) - completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) - assert attributes.get( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens - metrics = outputs[0].metrics - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue - ttft = metrics.first_token_time - metrics.arrival_time - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft - e2e_time = metrics.finished_time - 
metrics.arrival_time - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time - assert metrics.scheduler_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time - assert metrics.model_forward_time > 0 - assert attributes.get( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( - metrics.model_forward_time / 1000) - assert metrics.model_execute_time > 0 - assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE - ) == metrics.model_execute_time - assert metrics.model_forward_time < 1000 * metrics.model_execute_time + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS + ) == sampling_params.max_tokens + assert attributes.get( + SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( + outputs[0].prompt_token_ids) + completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) + assert attributes.get( + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens + metrics = outputs[0].metrics + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE + ) == metrics.time_in_queue + ttft = metrics.first_token_time - metrics.arrival_time + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + e2e_time = metrics.finished_time - metrics.arrival_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time + assert metrics.scheduler_time > 0 + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time + assert metrics.model_forward_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD + ) == pytest.approx(metrics.model_forward_time / 1000) + assert metrics.model_execute_time > 0 + assert attributes.get( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE + ) == metrics.model_execute_time + assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/utils.py b/tests/utils.py index fc19c8d031b16..06ba8a2421c16 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -566,6 +566,7 @@ def init_test_distributed_environment( def multi_process_parallel( + monkeypatch: pytest.MonkeyPatch, tp_size: int, pp_size: int, test_target: Any, @@ -582,7 +583,13 @@ def multi_process_parallel( refs = [] for rank in range(tp_size * pp_size): refs.append( - test_target.remote(tp_size, pp_size, rank, distributed_init_port)) + test_target.remote( + monkeypatch, + tp_size, + pp_size, + rank, + distributed_init_port, + ), ) ray.get(refs) ray.shutdown() @@ -700,7 +707,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: """ Get a pytest mark, which skips the test if the GPU doesn't meet a minimum memory requirement in GB. - + This can be leveraged via `@large_gpu_test` to skip tests in environments without enough resources, or called when filtering tests to run directly. 
""" diff --git a/tests/v1/e2e/test_ngram_spec_decode.py b/tests/v1/e2e/test_ngram_spec_decode.py index 519a74cab84bc..6cca324514565 100644 --- a/tests/v1/e2e/test_ngram_spec_decode.py +++ b/tests/v1/e2e/test_ngram_spec_decode.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random +from typing import Any import pytest @@ -50,8 +53,12 @@ def model_name(): return "meta-llama/Meta-Llama-3-8B-Instruct" -def test_ngram_correctness(monkeypatch, test_prompts, sampling_config, - model_name): +def test_ngram_correctness( + monkeypatch: pytest.MonkeyPatch, + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, +): ''' Compare the outputs of a original LLM and a speculative LLM should be the same when using ngram speculative decoding. diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5b9725d59ddc5..0ff804976ada6 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_load(monkeypatch, output_kind: RequestOutputKind, - engine_args_and_prompt: tuple[AsyncEngineArgs, - PromptType]): +async def test_load( + monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, + engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType], +): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. @@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind, [(TEXT_ENGINE_ARGS, TEXT_PROMPT), (VISION_ENGINE_ARGS, VISION_PROMPT)]) @pytest.mark.asyncio -async def test_abort(monkeypatch, output_kind: RequestOutputKind, +async def test_abort(monkeypatch: pytest.MonkeyPatch, + output_kind: RequestOutputKind, engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType]): diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 5fdbcf5b99636..2ec4f7e034af8 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest: @fork_new_process_for_each_test -def test_engine_core(monkeypatch): +def test_engine_core(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -159,10 +159,10 @@ def test_engine_core(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_advanced_sampling(monkeypatch): +def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): """ - A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as top_p, min_tokens, and + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch): @fork_new_process_for_each_test -def test_engine_core_concurrent_batches(monkeypatch): +def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): """ Test that the engine can handle multiple concurrent batches. 
""" diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e646ccbd46030..004b4dc82f4d9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str: @fork_new_process_for_each_test @pytest.mark.parametrize("multiprocessing_mode", [True, False]) -def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): +def test_engine_core_client(monkeypatch: pytest.MonkeyPatch, + multiprocessing_mode: bool): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): @pytest.mark.asyncio(loop_scope="function") -async def test_engine_core_client_asyncio(monkeypatch): +async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index e763aa2c86998..3800cb392fbad 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -57,7 +57,7 @@ def _repeat_logprob_config( logprob_prompt_logprob_list: BatchLogprobsSpecType, ) -> BatchLogprobsSpecType: """Ensure each test prompt has a logprob config. - + A logprob config specifies the optional (i.e. may-be-`None`) number of sample logprobs and the optional number of prompt logprobs. @@ -80,7 +80,7 @@ def _repeat_logprob_config( (optional num sample logprob, optional num prompt logprob) tuples - + Returns: list of (optional num sample logprob,optional num prompt logprob) @@ -255,14 +255,12 @@ def _run_and_validate( [NONE, SAMPLE, PROMPT, SAMPLE_PROMPT]) @pytest.mark.parametrize("temperature", [0.0, 2.0]) def test_get_logprobs_and_prompt_logprobs( - hf_model, - vllm_model, - batch_logprobs_composition: BatchLogprobsComposition, - temperature: float, - example_prompts, -) -> None: + hf_model, vllm_model, + batch_logprobs_composition: BatchLogprobsComposition, + temperature: float, example_prompts: list[str], + monkeypatch: pytest.MonkeyPatch) -> None: """Test V1 Engine logprobs & prompt logprobs - + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` settings and validate that * The generated logprobs and prompt logprobs are consistent with the @@ -279,7 +277,7 @@ def test_get_logprobs_and_prompt_logprobs( To save time, only test one APC-enabled scenario (sample & prompt logprobs enabled, temperature>0.0). - + Args: hf_model: HuggingFace reference model fixture vllm_model: vLLM model fixture @@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching - if do_apc and (temperature < 2.0 - or batch_logprobs_composition != SAMPLE_PROMPT): - # Skip some test-cases to save time. - pytest.skip() - test_prompts = example_prompts + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + if do_apc and (temperature < 2.0 + or batch_logprobs_composition != SAMPLE_PROMPT): + # Skip some test-cases to save time. 
+ pytest.skip() + test_prompts = example_prompts - max_tokens = 5 - hf_outputs = hf_model.generate_greedy( - test_prompts, - max_tokens=max_tokens, - ) - hf_logprobs = hf_model.generate_greedy_logprobs( - test_prompts, - max_tokens=max_tokens, - ) - - # Batch has mixed sample params - # (different logprobs/prompt logprobs combos) - logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) - - # Ensure that each test prompt has a logprob config for testing - logprob_prompt_logprob_list = _repeat_logprob_config( - test_prompts, logprob_prompt_logprob_list) - # Generate SamplingParams - vllm_sampling_params = [ - SamplingParams(max_tokens=max_tokens, - logprobs=num_lp, - prompt_logprobs=num_plp, - temperature=temperature, - seed=1984) - for num_lp, num_plp in logprob_prompt_logprob_list - ] - for _ in range(2 if do_apc else 1): - _run_and_validate( - vllm_model=vllm_model, - test_prompts=test_prompts, - vllm_sampling_params=vllm_sampling_params, - hf_logprobs=hf_logprobs, - hf_outputs=hf_outputs, - logprob_prompt_logprob_list=logprob_prompt_logprob_list, - temperature=temperature, + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, max_tokens=max_tokens, - do_apc=do_apc) + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch( + batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984) + for num_lp, num_plp in logprob_prompt_logprob_list + ] + for _ in range(2 if do_apc else 1): + _run_and_validate( + vllm_model=vllm_model, + test_prompts=test_prompts, + vllm_sampling_params=vllm_sampling_params, + hf_logprobs=hf_logprobs, + hf_outputs=hf_outputs, + logprob_prompt_logprob_list=logprob_prompt_logprob_list, + temperature=temperature, + max_tokens=max_tokens, + do_apc=do_apc) -def test_max_logprobs(): +def test_max_logprobs(monkeypatch: pytest.MonkeyPatch): """vLLM v1 engine should fail a request with `logprobs > max_logprobs` - Should also fail for `prompt_logprobs > max_logprobs` - APC should not matter as this test checks basic request validation. 
- - Args: - monkeypatch """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") - runner = VllmRunner("facebook/opt-125m", - max_logprobs=1, - enable_prefix_caching=False, - max_model_len=256) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + runner = VllmRunner("facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + max_model_len=256) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], + sampling_params=bad_sampling_params) -def test_none_logprobs(vllm_model, example_prompts): +def test_none_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return `logprobs` and `prompt_logprobs` as `None` - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - prompt_logprobs=None, - temperature=0.0) - results_logprobs_none = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) + sampling_params_logprobs_none = SamplingParams( + max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0, + ) + results_logprobs_none = vllm_model.model.generate( + example_prompts, + sampling_params=sampling_params_logprobs_none, + ) - for i in range(len(results_logprobs_none)): - # Check sample logprobs are None - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None - # Check prompt logprobs are None - assert results_logprobs_none[i].prompt_logprobs is None + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[ + 0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None -def test_zero_logprobs(vllm_model, example_prompts): +def test_zero_logprobs(vllm_model, example_prompts, + monkeypatch: pytest.MonkeyPatch): """Engine should return sampled token and prompt token logprobs - + Args: vllm_model: vLLM model fixture example_prompts: list of example prompts (test fixture) """ - max_tokens = 5 + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + max_tokens = 5 - sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, - logprobs=0, - prompt_logprobs=0, - temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( - example_prompts, sampling_params=sampling_params_logprobs_zero) + sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, + logprobs=0, + prompt_logprobs=0, + temperature=0.0) + results_logprobs_zero = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero) - for i in range(len(results_logprobs_zero)): - # Check that there is one sample logprob dict for each - # sample token - logprobs = results_logprobs_zero[i].outputs[0].logprobs - 
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs - sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids - prompt_token_ids = results_logprobs_zero[i].prompt_token_ids - assert logprobs is not None - assert len(sampled_token_ids) == len(logprobs) - assert results_logprobs_zero[i].outputs[ - 0].cumulative_logprob is not None - # Check that there is one prompt logprob dict for each - # prompt token - assert prompt_logprobs is not None - assert len(prompt_token_ids) == len(prompt_logprobs) + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[ + 0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 0309f545ea49e..241f49e4faea8 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -3,11 +3,16 @@ Run `pytest tests/v1/tpu/test_basic.py`. """ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest from vllm.platforms import current_platform -from ...conftest import VllmRunner +if TYPE_CHECKING: + from tests.conftest import VllmRunner MODELS = [ # "Qwen/Qwen2-7B-Instruct", @@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1] @pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES) def test_models( - monkeypatch, + vllm_runner: type[VllmRunner], + monkeypatch: pytest.MonkeyPatch, model: str, max_tokens: int, enforce_eager: bool, @@ -41,7 +47,7 @@ def test_models( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - with VllmRunner( + with vllm_runner( model, max_model_len=8192, enforce_eager=enforce_eager, @@ -50,5 +56,5 @@ def test_models( tensor_parallel_size=tensor_parallel_size) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - output = vllm_outputs[0][1] - assert "1024" in output + output = vllm_outputs[0][1] + assert "1024" in output