From 0bcc3a160d95d778cd53c96984d2b2b50fa68342 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Thu, 25 Sep 2025 20:19:40 +0800
Subject: [PATCH] [CI/Build] Fix flaky entrypoints test (#25663)

Signed-off-by: DarkLight1337
---
 .../test_completion_with_prompt_embeds.py     | 64 ++++++++++++-------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index 9c62595ad280b..ae51025455b10 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -15,7 +15,7 @@ from transformers import AutoConfig
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "facebook/opt-125m"
 CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
 
 
@@ -27,7 +27,7 @@ def default_server_args() -> list[str]:
         "--dtype",
         "bfloat16",
         "--max-model-len",
-        "8192",
+        "2048",
         "--max-num-seqs",
         "128",
         "--enforce-eager",
@@ -36,6 +36,27 @@
     ]
 
 
+EXAMPLE_PROMPTS = [
+    "Hello, my name is",
+    "What is an LLM?",
+]
+
+
+def _encode_embeds(embeds: torch.Tensor):
+    buffer = io.BytesIO()
+    torch.save(embeds, buffer)
+    return base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+
+@pytest.fixture(scope="module")
+def example_prompt_embeds(hf_runner):
+    """Create example embeddings and return them as base64 encoded string."""
+    with hf_runner(MODEL_NAME) as hf_model:
+        example_embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)
+
+    return [_encode_embeds(item) for item in example_embeddings]
+
+
 @pytest.fixture(scope="module",
                 params=["", "--disable-frontend-multiprocessing"])
 def server_with_prompt_embeds(default_server_args, request):
@@ -52,21 +73,16 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):
         yield async_client
 
 
-def create_dummy_embeds(num_tokens: int = 5) -> str:
-    """Create dummy embeddings and return them as base64 encoded string."""
-    dummy_embeds = torch.randn(num_tokens, CONFIG.hidden_size)
-    buffer = io.BytesIO()
-    torch.save(dummy_embeds, buffer)
-    return base64.b64encode(buffer.getvalue()).decode('utf-8')
-
-
-@pytest.mark.skip("This test is skipped because it is flaky.")
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_completions_with_prompt_embeds(
-        client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str):
+    example_prompt_embeds,
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+    model_name: str,
+):
+    encoded_embeds, encoded_embeds2 = example_prompt_embeds
+
     # Test case: Single prompt embeds input
-    encoded_embeds = create_dummy_embeds()
     completion = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="",  # Add empty prompt as required parameter
@@ -77,7 +93,6 @@ async def test_completions_with_prompt_embeds(
     assert completion.choices[0].prompt_logprobs is None
 
     # Test case: batch completion with prompt_embeds
-    encoded_embeds2 = create_dummy_embeds()
     completion = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="",  # Add empty prompt as required parameter
@@ -89,7 +104,6 @@ async def test_completions_with_prompt_embeds(
     assert len(completion.choices[1].text) >= 1
 
     # Test case: streaming with prompt_embeds
-    encoded_embeds = create_dummy_embeds()
     single_completion = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="",  # Add empty prompt as required parameter
@@ -117,7 +131,6 @@ async def test_completions_with_prompt_embeds(
     assert "".join(chunks) == single_output
 
     # Test case: batch streaming with prompt_embeds
-    encoded_embeds2 = create_dummy_embeds()
     stream = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="",  # Add empty prompt as required parameter
@@ -139,7 +152,6 @@ async def test_completions_with_prompt_embeds(
     assert len(chunks_stream_embeds[1]) > 0
 
     # Test case: mixed text and prompt_embeds
-    encoded_embeds = create_dummy_embeds()
     completion_mixed = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="This is a prompt",
@@ -184,10 +196,14 @@ async def test_completions_errors_with_prompt_embeds(
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_completions_with_logprobs_and_prompt_embeds(
-        client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int,
-        model_name: str):
+    example_prompt_embeds,
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+    logprobs_arg: int,
+    model_name: str,
+):
+    encoded_embeds, encoded_embeds2 = example_prompt_embeds
+
     # Test case: Logprobs using prompt_embeds
-    encoded_embeds = create_dummy_embeds()
     completion = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="",  # Add empty prompt as required parameter
@@ -207,7 +223,6 @@ async def test_completions_with_logprobs_and_prompt_embeds(
     assert len(logprobs.tokens) == 5
 
     # Test case: Log probs with batch completion and prompt_embeds
-    encoded_embeds2 = create_dummy_embeds()
     completion = await client_with_prompt_embeds.completions.create(
         model=model_name,
         prompt="",  # Add empty prompt as required parameter
@@ -232,9 +247,12 @@ async def test_completions_with_logprobs_and_prompt_embeds(
 
 @pytest.mark.asyncio
 async def test_prompt_logprobs_raises_error(
-        client_with_prompt_embeds: openai.AsyncOpenAI):
+    example_prompt_embeds,
+    client_with_prompt_embeds: openai.AsyncOpenAI,
+):
+    encoded_embeds, _ = example_prompt_embeds
+
     with pytest.raises(BadRequestError, match="not compatible"):
-        encoded_embeds = create_dummy_embeds()
         await client_with_prompt_embeds.completions.create(
             model=MODEL_NAME,
             prompt="",