[CI/Build] Fix flaky entrypoints test (#25663)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-09-25 20:19:40 +08:00 committed by GitHub
parent 70fbdb26e9
commit 0bcc3a160d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -15,7 +15,7 @@ from transformers import AutoConfig
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "facebook/opt-125m"
CONFIG = AutoConfig.from_pretrained(MODEL_NAME) CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
@ -27,7 +27,7 @@ def default_server_args() -> list[str]:
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--max-model-len", "--max-model-len",
"8192", "2048",
"--max-num-seqs", "--max-num-seqs",
"128", "128",
"--enforce-eager", "--enforce-eager",
@ -36,6 +36,27 @@ def default_server_args() -> list[str]:
] ]
# Fixed prompts used to build *real* prompt embeddings for the tests below;
# the commit replaces the previous randomly-generated dummy embeddings
# (create_dummy_embeds) with embeddings derived from these prompts.
EXAMPLE_PROMPTS = [
    "Hello, my name is",
    "What is an LLM?",
]
def _encode_embeds(embeds: torch.Tensor):
buffer = io.BytesIO()
torch.save(embeds, buffer)
return base64.b64encode(buffer.getvalue()).decode('utf-8')
@pytest.fixture(scope="module")
def example_prompt_embeds(hf_runner):
    """Create example embeddings and return them as base64 encoded string."""
    # Run the HF model once per module; embeddings are reused by every test.
    with hf_runner(MODEL_NAME) as hf_model:
        raw_embeddings = hf_model.get_prompt_embeddings(EXAMPLE_PROMPTS)
    encoded = []
    for embedding in raw_embeddings:
        encoded.append(_encode_embeds(embedding))
    return encoded
@pytest.fixture(scope="module", @pytest.fixture(scope="module",
params=["", "--disable-frontend-multiprocessing"]) params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request): def server_with_prompt_embeds(default_server_args, request):
@ -52,21 +73,16 @@ async def client_with_prompt_embeds(server_with_prompt_embeds):
yield async_client yield async_client
def create_dummy_embeds(num_tokens: int = 5) -> str:
    """Create dummy embeddings and return them as base64 encoded string."""
    # Random (num_tokens, hidden_size) tensor matching the model's embedding
    # width; serialized via torch.save so the server can torch.load it.
    embeds = torch.randn(num_tokens, CONFIG.hidden_size)
    raw = io.BytesIO()
    torch.save(embeds, raw)
    payload = raw.getvalue()
    return base64.b64encode(payload).decode('utf-8')
@pytest.mark.skip("This test is skipped because it is flaky.")
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_prompt_embeds( async def test_completions_with_prompt_embeds(
client_with_prompt_embeds: openai.AsyncOpenAI, model_name: str): example_prompt_embeds,
client_with_prompt_embeds: openai.AsyncOpenAI,
model_name: str,
):
encoded_embeds, encoded_embeds2 = example_prompt_embeds
# Test case: Single prompt embeds input # Test case: Single prompt embeds input
encoded_embeds = create_dummy_embeds()
completion = await client_with_prompt_embeds.completions.create( completion = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="", # Add empty prompt as required parameter prompt="", # Add empty prompt as required parameter
@ -77,7 +93,6 @@ async def test_completions_with_prompt_embeds(
assert completion.choices[0].prompt_logprobs is None assert completion.choices[0].prompt_logprobs is None
# Test case: batch completion with prompt_embeds # Test case: batch completion with prompt_embeds
encoded_embeds2 = create_dummy_embeds()
completion = await client_with_prompt_embeds.completions.create( completion = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="", # Add empty prompt as required parameter prompt="", # Add empty prompt as required parameter
@ -89,7 +104,6 @@ async def test_completions_with_prompt_embeds(
assert len(completion.choices[1].text) >= 1 assert len(completion.choices[1].text) >= 1
# Test case: streaming with prompt_embeds # Test case: streaming with prompt_embeds
encoded_embeds = create_dummy_embeds()
single_completion = await client_with_prompt_embeds.completions.create( single_completion = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="", # Add empty prompt as required parameter prompt="", # Add empty prompt as required parameter
@ -117,7 +131,6 @@ async def test_completions_with_prompt_embeds(
assert "".join(chunks) == single_output assert "".join(chunks) == single_output
# Test case: batch streaming with prompt_embeds # Test case: batch streaming with prompt_embeds
encoded_embeds2 = create_dummy_embeds()
stream = await client_with_prompt_embeds.completions.create( stream = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="", # Add empty prompt as required parameter prompt="", # Add empty prompt as required parameter
@ -139,7 +152,6 @@ async def test_completions_with_prompt_embeds(
assert len(chunks_stream_embeds[1]) > 0 assert len(chunks_stream_embeds[1]) > 0
# Test case: mixed text and prompt_embeds # Test case: mixed text and prompt_embeds
encoded_embeds = create_dummy_embeds()
completion_mixed = await client_with_prompt_embeds.completions.create( completion_mixed = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="This is a prompt", prompt="This is a prompt",
@ -184,10 +196,14 @@ async def test_completions_errors_with_prompt_embeds(
@pytest.mark.parametrize("logprobs_arg", [1, 0]) @pytest.mark.parametrize("logprobs_arg", [1, 0])
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_logprobs_and_prompt_embeds( async def test_completions_with_logprobs_and_prompt_embeds(
client_with_prompt_embeds: openai.AsyncOpenAI, logprobs_arg: int, example_prompt_embeds,
model_name: str): client_with_prompt_embeds: openai.AsyncOpenAI,
logprobs_arg: int,
model_name: str,
):
encoded_embeds, encoded_embeds2 = example_prompt_embeds
# Test case: Logprobs using prompt_embeds # Test case: Logprobs using prompt_embeds
encoded_embeds = create_dummy_embeds()
completion = await client_with_prompt_embeds.completions.create( completion = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="", # Add empty prompt as required parameter prompt="", # Add empty prompt as required parameter
@ -207,7 +223,6 @@ async def test_completions_with_logprobs_and_prompt_embeds(
assert len(logprobs.tokens) == 5 assert len(logprobs.tokens) == 5
# Test case: Log probs with batch completion and prompt_embeds # Test case: Log probs with batch completion and prompt_embeds
encoded_embeds2 = create_dummy_embeds()
completion = await client_with_prompt_embeds.completions.create( completion = await client_with_prompt_embeds.completions.create(
model=model_name, model=model_name,
prompt="", # Add empty prompt as required parameter prompt="", # Add empty prompt as required parameter
@ -232,9 +247,12 @@ async def test_completions_with_logprobs_and_prompt_embeds(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_prompt_logprobs_raises_error( async def test_prompt_logprobs_raises_error(
client_with_prompt_embeds: openai.AsyncOpenAI): example_prompt_embeds,
client_with_prompt_embeds: openai.AsyncOpenAI,
):
encoded_embeds, _ = example_prompt_embeds
with pytest.raises(BadRequestError, match="not compatible"): with pytest.raises(BadRequestError, match="not compatible"):
encoded_embeds = create_dummy_embeds()
await client_with_prompt_embeds.completions.create( await client_with_prompt_embeds.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
prompt="", prompt="",