From 2c9ed5b73804c698f0fde39af0e87be1e1b040aa Mon Sep 17 00:00:00 2001
From: AzizCode92
Date: Sat, 6 Sep 2025 22:59:30 +0200
Subject: [PATCH] chore: Improve GPU cleanup in tests

Centralizes the GPU memory cleanup logic into a single static method to
prevent flaky test failures from OOM errors.

Note: the helper uses a single leading underscore (`_cleanup_gpu_memory`)
rather than a double underscore, so the inherited call from
`RemoteOpenAIServerCustom` is not broken by Python name mangling.

Signed-off-by: AzizCode92
---
 tests/utils.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/tests/utils.py b/tests/utils.py
index b061caf6a4489..38bdb4007d2ba 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -170,14 +170,18 @@ class RemoteOpenAIServer:
         except subprocess.TimeoutExpired:
             # force kill if needed
             self.proc.kill()
-        # GPU memory cleanup
+        self._cleanup_gpu_memory()
+
+    @staticmethod
+    def _cleanup_gpu_memory():
         try:
-            if current_platform.is_cuda() or current_platform.is_rocm():
+            if current_platform.is_cuda_alike():
                 num_devices = cuda_device_count_stateless()
                 if num_devices > 0:
                     wait_for_gpu_memory_to_clear(devices=list(
                         range(num_devices)),
-                                                 threshold_ratio=0.05)
+                                                 threshold_ratio=0.05,
+                                                 timeout_s=60)
         except Exception as e:
             print(f"GPU cleanup warning: {e}")
 
@@ -276,16 +280,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
             # force kill if needed
             self.proc.kill()
 
-        # GPU memory cleaning
-        try:
-            if current_platform.is_cuda() or current_platform.is_rocm():
-                num_devices = cuda_device_count_stateless()
-                if num_devices > 0:
-                    wait_for_gpu_memory_to_clear(devices=list(
-                        range(num_devices)),
-                                                 threshold_ratio=0.05)
-        except Exception as e:
-            print(f"GPU cleanup warning: {e}")
+        self._cleanup_gpu_memory()
 
 
 def _test_completion(