From a64a84433d6d1162290bbcd48b161c8ddff2c801 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu"
Date: Thu, 20 Feb 2025 01:20:15 -0800
Subject: [PATCH] [2/n][ci] S3: Use full model path (#13564)

Signed-off-by: <>
---
 tests/basic_correctness/test_cumem.py       |  2 +-
 tests/conftest.py                           |  3 +--
 tests/engine/test_computed_prefix_blocks.py |  3 ++-
 tests/engine/test_detokenization.py         |  3 ++-
 tests/engine/test_executor.py               | 12 ++++++++----
 tests/engine/test_skip_tokenizer_init.py    |  3 ++-
 tests/test_config.py                        | 13 +++++++------
 tests/test_regression.py                    |  6 +++---
 8 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 7ebccdb5caed..f1148fc8e3f4 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -121,7 +121,7 @@ def test_cumem_with_cudagraph():
     "model, use_v1",
     [
         # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", True),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
         ("facebook/opt-125m", False),
     ])
diff --git a/tests/conftest.py b/tests/conftest.py
index ca268dd6657c..9304b8f17dca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -746,8 +746,7 @@ class VllmRunner:
         **kwargs,
     ) -> None:
         if model_name in MODELS_ON_S3 and not load_format:
-            model_name = (f"s3://vllm-ci-model-weights/"
-                          f"{model_name.split('/')[-1]}")
+            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
             load_format = LoadFormat.RUNAI_STREAMER
         if not load_format:
             load_format = LoadFormat.AUTO
diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py
index 93907ecae554..51e7c8e7739d 100644
--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
 from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
     # This test checks if we are able to run the engine to completion
diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py
index ab594aeee40d..6ae4be2e4786 100644
--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -9,7 +9,8 @@ from vllm.sampling_params import SamplingParams
 from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
     # This test checks if the engine generates completions both with and
     # without optional detokenization, that detokenization includes text
diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py
index 31c07e709bd9..6a86401ce5db 100644
--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -38,7 +38,8 @@ class CustomUniExecutor(UniProcExecutor):
 CustomUniExecutorAsync = CustomUniExecutor
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
     with pytest.raises(ValueError):
         engine_args = EngineArgs(model=model,
@@ -51,7 +52,8 @@ def test_custom_executor_type_checking(model):
         AsyncLLMEngine.from_engine_args(engine_args)
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -75,7 +77,8 @@ def test_custom_executor(model, tmp_path):
         os.chdir(cwd)
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -103,7 +106,8 @@ def test_custom_executor_async(model, tmp_path):
         os.chdir(cwd)
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_respect_ray(model):
     # even for TP=1 and PP=1,
     # if users specify ray, we should use ray.
diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py
index fee7fd3f6aad..b0930eaac17b 100644
--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -9,7 +9,8 @@ from vllm.sampling_params import SamplingParams
 from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 
 
-@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
+@pytest.mark.parametrize("model",
+                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
     # This test checks if the flag skip_tokenizer_init skips the initialization
     # of tokenizer and detokenizer. The generated output is expected to contain
diff --git a/tests/test_config.py b/tests/test_config.py
index 4a1718613302..bc87e6ccdfcc 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -14,13 +14,14 @@ from .conftest import MODEL_WEIGHTS_S3_BUCKET
 @pytest.mark.parametrize(
     ("model_id", "expected_runner_type", "expected_task"),
     [
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2", "generate", "generate"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/e5-mistral-7b-instruct", "pooling",
-         "embed"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/Qwen2.5-1.5B-apeach", "pooling",
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
+         "generate"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
+         "pooling", "embed"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
          "classify"),
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/ms-marco-MiniLM-L-6-v2", "pooling",
-         "score"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
+         "pooling", "score"),
         ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
         ("openai/whisper-small", "transcription", "transcription"),
     ],
diff --git a/tests/test_regression.py b/tests/test_regression.py
index e9b21e1a7232..8cecc2892b6e 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -21,7 +21,7 @@ def test_duplicated_ignored_sequence_group():
     sampling_params = SamplingParams(temperature=0.01,
                                      top_p=0.1,
                                      max_tokens=256)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
               load_format=LoadFormat.RUNAI_STREAMER,
               max_num_batched_tokens=4096,
               tensor_parallel_size=1)
@@ -35,7 +35,7 @@ def test_max_tokens_none():
     sampling_params = SamplingParams(temperature=0.01,
                                      top_p=0.1,
                                      max_tokens=None)
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
               load_format=LoadFormat.RUNAI_STREAMER,
               max_num_batched_tokens=4096,
               tensor_parallel_size=1)
@@ -46,7 +46,7 @@ def test_max_tokens_none():
 
 
 def test_gc():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
               load_format=LoadFormat.RUNAI_STREAMER,
               enforce_eager=True)
     del llm
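
Note: the tests/conftest.py hunk is the behavioral core of this change. VllmRunner now keeps the full Hugging Face repo id ("org/name") in the S3 key instead of only the basename (the old code used model_name.split('/')[-1]), so two models that share a basename can no longer collide in the CI bucket. Below is a minimal, self-contained sketch of the new resolution logic; the LoadFormat stub, the bucket value (taken from the removed hard-coded string), and the allowlist entry are illustrative assumptions, not the test suite's actual definitions.

    from enum import Enum


    class LoadFormat(str, Enum):
        # Stub covering only the two formats exercised by the hunk above.
        AUTO = "auto"
        RUNAI_STREAMER = "runai_streamer"


    MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"  # assumed value
    MODELS_ON_S3 = {"distilbert/distilgpt2"}  # assumed allowlist entry


    def resolve_model(model_name, load_format=None):
        # Mirrors VllmRunner.__init__ after this patch: the full repo id
        # is appended to the bucket, preserving the "org/" prefix.
        if model_name in MODELS_ON_S3 and not load_format:
            model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}"
            load_format = LoadFormat.RUNAI_STREAMER
        return model_name, load_format or LoadFormat.AUTO


    print(resolve_model("distilbert/distilgpt2"))
    # ('s3://vllm-ci-model-weights/distilbert/distilgpt2',
    #  <LoadFormat.RUNAI_STREAMER: 'runai_streamer'>)

Models not on the allowlist, or calls that pass an explicit load_format, fall through unchanged, which is why the non-S3 cases in the diff (e.g. "facebook/opt-125m", "Qwen/Qwen2.5-Math-RM-72B") keep their plain repo ids.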