[BugFix] Increase timeout for startup failure test (#17642)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill 2025-05-05 13:53:19 -07:00 committed by GitHub
parent d3efde8176
commit 5ea5c514da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,12 +1,13 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import os
import signal
import time import time
import uuid import uuid
from threading import Thread from threading import Thread
from typing import Optional from typing import Optional
import psutil
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
@ -17,8 +18,8 @@ from vllm.platforms import current_platform
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore from vllm.v1.engine.core import EngineCore
from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient, from vllm.v1.engine.core_client import (AsyncMPClient, CoreEngine,
SyncMPClient) EngineCoreClient, SyncMPClient)
from vllm.v1.executor.abstract import Executor from vllm.v1.executor.abstract import Executor
from ...distributed.conftest import MockSubscriber from ...distributed.conftest import MockSubscriber
@ -337,34 +338,40 @@ def test_kv_cache_events(
"Token ids should be the same as the custom tokens") "Token ids should be the same as the custom tokens")
finally: finally:
client.shutdown() client.shutdown()
return
@pytest.mark.timeout(10) @pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch): def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info: with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None]
ce_ctor = CoreEngine.__init__
def patched_ce_ctor(self, *args, **kwargs):
ce_ctor(self, *args, **kwargs)
core_proc_pid[0] = self.proc_handle.proc.pid
m.setattr(CoreEngine, "__init__", patched_ce_ctor)
t = time.time()
engine_args = EngineArgs(model=MODEL_NAME) engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config( vllm_config = engine_args.create_engine_config(
usage_context=UsageContext.UNKNOWN_CONTEXT) usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config) executor_class = Executor.get_class(vllm_config)
print(f"VllmConfig creation took {time.time() - t:.2f} seconds.")
# Start another thread to wait for engine core process to start # Start another thread to wait for engine core process to start
# and kill it - simulate fatal uncaught process exit. # and kill it - simulate fatal uncaught process exit.
this_proc = psutil.Process()
children_before = set(this_proc.children())
def kill_first_child(): def kill_first_child():
while True: while (child_pid := core_proc_pid[0]) is None:
time.sleep(0.5) time.sleep(0.5)
children = set(this_proc.children()) - children_before print(f"Killing child core process {child_pid}")
if children: assert isinstance(child_pid, int)
child = children.pop() os.kill(child_pid, signal.SIGKILL)
print("Killing child core process", child.pid)
child.kill()
break
Thread(target=kill_first_child, daemon=True).start() Thread(target=kill_first_child, daemon=True).start()