[ci] set timeout for test_oot_registration.py (#7082)

2026-07-22 19:17:32 +08:00 · 2024-08-02 10:03:24 -07:00 · 2024-08-02 10:03:24 -07:00 · 806949514a
commit 806949514a
parent c16eaac500
3 changed files with 10 additions and 2 deletions
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
@@ -36,10 +36,12 @@ def test_oot_registration_for_api_server():
    ctx = torch.multiprocessing.get_context()
    server = ctx.Process(target=server_function, args=(port, ))
    server.start()
+    MAX_SERVER_START_WAIT_S = 60
    client = OpenAI(
        base_url=f"http://localhost:{port}/v1",
        api_key="token-abc123",
    )
+    now = time.time()
    while True:
        try:
            completion = client.chat.completions.create(
@@ -57,6 +59,8 @@ def test_oot_registration_for_api_server():
        except OpenAIError as e:
            if "Connection error" in str(e):
                time.sleep(3)
+                if time.time() - now > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError("Server did not start in time") from e
            else:
                raise e
    server.kill()
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -186,7 +186,9 @@ class Worker(LocalOrDistributedWorkerBase):
        # GPU did not change their memory usage during the profiling.
        peak_memory = self.init_gpu_memory - free_gpu_memory
        assert peak_memory > 0, (
-            "Error in memory profiling. This happens when the GPU memory was "
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
            "not properly cleaned up before initializing the vLLM instance.")

        cache_block_size = self.get_cache_block_size_bytes()
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -138,7 +138,9 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
        # GPU did not change their memory usage during the profiling.
        peak_memory = self.init_gpu_memory - free_gpu_memory
        assert peak_memory > 0, (
-            "Error in memory profiling. This happens when the GPU memory was "
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
            "not properly cleaned up before initializing the vLLM instance.")

        cache_block_size = self.get_cache_block_size_bytes()