From 36ccdcad2caaa00e244d8a1da605e9983258516b Mon Sep 17 00:00:00 2001
From: Robert Shaw <robshaw@redhat.com>
Date: Thu, 14 Aug 2025 03:34:37 +0000
Subject: [PATCH] Remove tests/test_regression.py and its Buildkite CI step

Signed-off-by: Robert Shaw <robshaw@redhat.com>
---
 .buildkite/test-pipeline.yaml | 10 -----
 tests/test_regression.py      | 78 -----------------------------------
 2 files changed, 88 deletions(-)
 delete mode 100644 tests/test_regression.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 740be2bc87706..243f388f5579f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -227,16 +227,6 @@ steps:
 ##### fast check tests  #####
 #####  1 GPU test  #####
 
-- label: Regression Test # 5min
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/test_regression
-  commands:
-  - pip install modelscope
-  - pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
-
 - label: Engine Test # 10min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
diff --git a/tests/test_regression.py b/tests/test_regression.py
deleted file mode 100644
index f5f1ed8e805e0..0000000000000
--- a/tests/test_regression.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Containing tests that check for regressions in vLLM's behavior.
-
-It should include tests that are reported by users and making sure they
-will never happen again.
-
-"""
-import gc
-
-import pytest
-import torch
-
-from vllm import LLM, SamplingParams
-
-
-@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
-def test_duplicated_ignored_sequence_group():
-    """https://github.com/vllm-project/vllm/issues/1655"""
-
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=256)
-    llm = LLM(model="distilbert/distilgpt2",
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1)
-    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
-    outputs = llm.generate(prompts, sampling_params=sampling_params)
-
-    assert len(prompts) == len(outputs)
-
-
-def test_max_tokens_none():
-    sampling_params = SamplingParams(temperature=0.01,
-                                     top_p=0.1,
-                                     max_tokens=None)
-    llm = LLM(model="distilbert/distilgpt2",
-              max_num_batched_tokens=4096,
-              tensor_parallel_size=1)
-    prompts = ["Just say hello!"]
-    outputs = llm.generate(prompts, sampling_params=sampling_params)
-
-    assert len(prompts) == len(outputs)
-
-
-def test_gc():
-    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
-    del llm
-
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    # The memory allocated for model and KV cache should be released.
-    # The memory allocated for PyTorch and others should be less than 50MB.
-    # Usually, it's around 10MB.
-    allocated = torch.cuda.memory_allocated()
-    assert allocated < 50 * 1024 * 1024
-
-
-def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
-    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_MODELSCOPE", "True")
-        # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
-        # with 400 Client Error: Bad Request.
-        m.setenv("HF_TOKEN", "")
-        llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
-
-        prompts = [
-            "Hello, my name is",
-            "The president of the United States is",
-            "The capital of France is",
-            "The future of AI is",
-        ]
-        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-        outputs = llm.generate(prompts, sampling_params)
-        assert len(outputs) == 4