From 19ffe12f323fa24defac4cf063ec2687f649802c Mon Sep 17 00:00:00 2001
From: AzizCode92 <azizbenothman76@gmail.com>
Date: Thu, 4 Sep 2025 17:50:51 +0200
Subject: [PATCH 1/4] [feat]: ensure GPU memory is released when exiting the
 remote OpenAI server

Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
---
 tests/utils.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/utils.py b/tests/utils.py
index e47235002657d..8eb12e9c4866e 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,6 +4,7 @@
 import asyncio
 import copy
 import functools
+import gc
 import importlib
 import json
 import os
@@ -170,6 +171,16 @@ class RemoteOpenAIServer:
         except subprocess.TimeoutExpired:
             # force kill if needed
             self.proc.kill()
+        # GPU memory cleanup
+        try:
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+                torch.cuda.synchronize()
+                # Small delay to ensure cleanup completes
+                time.sleep(0.5)
+        except Exception as e:
+            print(f"GPU cleanup warning: {e}")
 
     def _poll(self) -> Optional[int]:
         """Subclasses override this method to customize process polling"""
@@ -266,6 +277,17 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
             # force kill if needed
             self.proc.kill()
 
+        # GPU memory cleaning
+        try:
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+                torch.cuda.synchronize()
+                # Small delay to ensure cleanup completes
+                time.sleep(0.5)
+        except Exception as e:
+            print(f"GPU cleanup warning: {e}")
+
 
 def _test_completion(
     client: openai.OpenAI,

From 4c8625fdb17cec575cda8f7f09ff901d6c43993a Mon Sep 17 00:00:00 2001
From: AzizCode92 <azizbenothman76@gmail.com>
Date: Thu, 4 Sep 2025 20:35:27 +0200
Subject: [PATCH 2/4] fix: use `wait_for_gpu_memory_to_clear` to wait for GPU
 memory to be released

Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
---
 tests/utils.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/tests/utils.py b/tests/utils.py
index 8eb12e9c4866e..b08009fda0ef8 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,7 +4,6 @@
 import asyncio
 import copy
 import functools
-import gc
 import importlib
 import json
 import os
@@ -174,11 +173,10 @@ class RemoteOpenAIServer:
         # GPU memory cleanup
         try:
             if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                gc.collect()
-                torch.cuda.synchronize()
-                # Small delay to ensure cleanup completes
-                time.sleep(0.5)
+                devices_to_clear = list(range(torch.cuda.device_count()))
+                if devices_to_clear:
+                    wait_for_gpu_memory_to_clear(devices=devices_to_clear,
+                                                 threshold_ratio=0.05)
         except Exception as e:
             print(f"GPU cleanup warning: {e}")
 
@@ -280,11 +278,10 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
         # GPU memory cleaning
         try:
             if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                gc.collect()
-                torch.cuda.synchronize()
-                # Small delay to ensure cleanup completes
-                time.sleep(0.5)
+                devices_to_clear = list(range(torch.cuda.device_count()))
+                if devices_to_clear:
+                    wait_for_gpu_memory_to_clear(devices=devices_to_clear,
+                                                 threshold_ratio=0.05)
         except Exception as e:
             print(f"GPU cleanup warning: {e}")
 

From 963fe4b0879ccf94f5a6dfd8e8001759872255ee Mon Sep 17 00:00:00 2001
From: AzizCode92 <azizbenothman76@gmail.com>
Date: Fri, 5 Sep 2025 09:34:10 +0200
Subject: [PATCH 3/4] [fix]: ensure the cleaning of the GPU memory is
 hardware-agnostic

Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
---
 tests/utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/utils.py b/tests/utils.py
index b08009fda0ef8..b061caf6a4489 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -172,10 +172,11 @@ class RemoteOpenAIServer:
             self.proc.kill()
         # GPU memory cleanup
         try:
-            if torch.cuda.is_available():
-                devices_to_clear = list(range(torch.cuda.device_count()))
-                if devices_to_clear:
-                    wait_for_gpu_memory_to_clear(devices=devices_to_clear,
+            if current_platform.is_cuda() or current_platform.is_rocm():
+                num_devices = cuda_device_count_stateless()
+                if num_devices > 0:
+                    wait_for_gpu_memory_to_clear(devices=list(
+                        range(num_devices)),
                                                  threshold_ratio=0.05)
         except Exception as e:
             print(f"GPU cleanup warning: {e}")
@@ -277,10 +278,11 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
 
         # GPU memory cleaning
         try:
-            if torch.cuda.is_available():
-                devices_to_clear = list(range(torch.cuda.device_count()))
-                if devices_to_clear:
-                    wait_for_gpu_memory_to_clear(devices=devices_to_clear,
+            if current_platform.is_cuda() or current_platform.is_rocm():
+                num_devices = cuda_device_count_stateless()
+                if num_devices > 0:
+                    wait_for_gpu_memory_to_clear(devices=list(
+                        range(num_devices)),
                                                  threshold_ratio=0.05)
         except Exception as e:
             print(f"GPU cleanup warning: {e}")

From 2c9ed5b73804c698f0fde39af0e87be1e1b040aa Mon Sep 17 00:00:00 2001
From: AzizCode92 <azizbenothman76@gmail.com>
Date: Sat, 6 Sep 2025 22:59:30 +0200
Subject: [PATCH 4/4] chore: Improve GPU cleanup in tests

Centralizes the GPU memory cleanup logic into a single static method to prevent flaky test failures from OOM errors.

Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
---
 tests/utils.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/tests/utils.py b/tests/utils.py
index b061caf6a4489..38bdb4007d2ba 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -170,14 +170,18 @@ class RemoteOpenAIServer:
         except subprocess.TimeoutExpired:
             # force kill if needed
             self.proc.kill()
-        # GPU memory cleanup
+        self._cleanup_gpu_memory()
+
+    @staticmethod
+    def _cleanup_gpu_memory():
         try:
-            if current_platform.is_cuda() or current_platform.is_rocm():
+            if current_platform.is_cuda_alike():
                 num_devices = cuda_device_count_stateless()
                 if num_devices > 0:
                     wait_for_gpu_memory_to_clear(devices=list(
                         range(num_devices)),
-                                                 threshold_ratio=0.05)
+                                                 threshold_ratio=0.05,
+                                                 timeout_s=60)
         except Exception as e:
             print(f"GPU cleanup warning: {e}")
 
@@ -276,16 +280,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
             # force kill if needed
             self.proc.kill()
 
-        # GPU memory cleaning
-        try:
-            if current_platform.is_cuda() or current_platform.is_rocm():
-                num_devices = cuda_device_count_stateless()
-                if num_devices > 0:
-                    wait_for_gpu_memory_to_clear(devices=list(
-                        range(num_devices)),
-                                                 threshold_ratio=0.05)
-        except Exception as e:
-            print(f"GPU cleanup warning: {e}")
+        self._cleanup_gpu_memory()
 
 
 def _test_completion(