From 36ccdcad2caaa00e244d8a1da605e9983258516b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Aug 2025 03:34:37 +0000 Subject: [PATCH] updated Signed-off-by: Robert Shaw --- .buildkite/test-pipeline.yaml | 10 ----- tests/test_regression.py | 78 ----------------------------------- 2 files changed, 88 deletions(-) delete mode 100644 tests/test_regression.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 740be2bc87706..243f388f5579f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -227,16 +227,6 @@ steps: ##### fast check tests ##### ##### 1 GPU test ##### -- label: Regression Test # 5min - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - - label: Engine Test # 10min mirror_hardwares: [amdexperimental] source_file_dependencies: diff --git a/tests/test_regression.py b/tests/test_regression.py deleted file mode 100644 index f5f1ed8e805e0..0000000000000 --- a/tests/test_regression.py +++ /dev/null @@ -1,78 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Containing tests that check for regressions in vLLM's behavior. - -It should include tests that are reported by users and making sure they -will never happen again. - -""" -import gc - -import pytest -import torch - -from vllm import LLM, SamplingParams - - -@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") -def test_duplicated_ignored_sequence_group(): - """https://github.com/vllm-project/vllm/issues/1655""" - - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=256) - llm = LLM(model="distilbert/distilgpt2", - max_num_batched_tokens=4096, - tensor_parallel_size=1) - prompts = ["This is a short prompt", "This is a very long prompt " * 1000] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - assert len(prompts) == len(outputs) - - -def test_max_tokens_none(): - sampling_params = SamplingParams(temperature=0.01, - top_p=0.1, - max_tokens=None) - llm = LLM(model="distilbert/distilgpt2", - max_num_batched_tokens=4096, - tensor_parallel_size=1) - prompts = ["Just say hello!"] - outputs = llm.generate(prompts, sampling_params=sampling_params) - - assert len(prompts) == len(outputs) - - -def test_gc(): - llm = LLM(model="distilbert/distilgpt2", enforce_eager=True) - del llm - - gc.collect() - torch.cuda.empty_cache() - - # The memory allocated for model and KV cache should be released. - # The memory allocated for PyTorch and others should be less than 50MB. - # Usually, it's around 10MB. - allocated = torch.cuda.memory_allocated() - assert allocated < 50 * 1024 * 1024 - - -def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): - # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary - with monkeypatch.context() as m: - m.setenv("VLLM_USE_MODELSCOPE", "True") - # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail - # with 400 Client Error: Bad Request. - m.setenv("HF_TOKEN", "") - llm = LLM(model="qwen/Qwen1.5-0.5B-Chat") - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - outputs = llm.generate(prompts, sampling_params) - assert len(outputs) == 4