From 19ffe12f323fa24defac4cf063ec2687f649802c Mon Sep 17 00:00:00 2001 From: AzizCode92 Date: Thu, 4 Sep 2025 17:50:51 +0200 Subject: [PATCH 1/4] [feat]: ensure the gpu memory is cleaned when exiting the remote openAI server Signed-off-by: AzizCode92 --- tests/utils.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index e47235002657d..8eb12e9c4866e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,6 +4,7 @@ import asyncio import copy import functools +import gc import importlib import json import os @@ -170,6 +171,16 @@ class RemoteOpenAIServer: except subprocess.TimeoutExpired: # force kill if needed self.proc.kill() + # GPU memory cleanup + try: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + torch.cuda.synchronize() + # Small delay to ensure cleanup completes + time.sleep(0.5) + except Exception as e: + print(f"GPU cleanup warning: {e}") def _poll(self) -> Optional[int]: """Subclasses override this method to customize process polling""" @@ -266,6 +277,17 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): # force kill if needed self.proc.kill() + # GPU memory cleaning + try: + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + torch.cuda.synchronize() + # Small delay to ensure cleanup completes + time.sleep(0.5) + except Exception as e: + print(f"GPU cleanup warning: {e}") + def _test_completion( client: openai.OpenAI, From 4c8625fdb17cec575cda8f7f09ff901d6c43993a Mon Sep 17 00:00:00 2001 From: AzizCode92 Date: Thu, 4 Sep 2025 20:35:27 +0200 Subject: [PATCH 2/4] fix: use `wait_for_gpu_memory_to_clear` to clear gpu memory Signed-off-by: AzizCode92 --- tests/utils.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 8eb12e9c4866e..b08009fda0ef8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,6 @@ import asyncio import copy import functools -import gc import 
importlib import json import os @@ -174,11 +173,10 @@ class RemoteOpenAIServer: # GPU memory cleanup try: if torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() - # Small delay to ensure cleanup completes - time.sleep(0.5) + devices_to_clear = list(range(torch.cuda.device_count())) + if devices_to_clear: + wait_for_gpu_memory_to_clear(devices=devices_to_clear, + threshold_ratio=0.05) except Exception as e: print(f"GPU cleanup warning: {e}") @@ -280,11 +278,10 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): # GPU memory cleaning try: if torch.cuda.is_available(): - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() - # Small delay to ensure cleanup completes - time.sleep(0.5) + devices_to_clear = list(range(torch.cuda.device_count())) + if devices_to_clear: + wait_for_gpu_memory_to_clear(devices=devices_to_clear, + threshold_ratio=0.05) except Exception as e: print(f"GPU cleanup warning: {e}") From 963fe4b0879ccf94f5a6dfd8e8001759872255ee Mon Sep 17 00:00:00 2001 From: AzizCode92 Date: Fri, 5 Sep 2025 09:34:10 +0200 Subject: [PATCH 3/4] [fix]: ensure the cleaning of the GPU memory is hardware-agnostic Signed-off-by: AzizCode92 --- tests/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index b08009fda0ef8..b061caf6a4489 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -172,10 +172,11 @@ class RemoteOpenAIServer: self.proc.kill() # GPU memory cleanup try: - if torch.cuda.is_available(): - devices_to_clear = list(range(torch.cuda.device_count())) - if devices_to_clear: - wait_for_gpu_memory_to_clear(devices=devices_to_clear, + if current_platform.is_cuda() or current_platform.is_rocm(): + num_devices = cuda_device_count_stateless() + if num_devices > 0: + wait_for_gpu_memory_to_clear(devices=list( + range(num_devices)), threshold_ratio=0.05) except Exception as e: print(f"GPU cleanup warning: {e}") @@ -277,10 +278,11 @@ 
class RemoteOpenAIServerCustom(RemoteOpenAIServer): # GPU memory cleaning try: - if torch.cuda.is_available(): - devices_to_clear = list(range(torch.cuda.device_count())) - if devices_to_clear: - wait_for_gpu_memory_to_clear(devices=devices_to_clear, + if current_platform.is_cuda() or current_platform.is_rocm(): + num_devices = cuda_device_count_stateless() + if num_devices > 0: + wait_for_gpu_memory_to_clear(devices=list( + range(num_devices)), threshold_ratio=0.05) except Exception as e: print(f"GPU cleanup warning: {e}") From 2c9ed5b73804c698f0fde39af0e87be1e1b040aa Mon Sep 17 00:00:00 2001 From: AzizCode92 Date: Sat, 6 Sep 2025 22:59:30 +0200 Subject: [PATCH 4/4] chore: Improve GPU cleanup in tests Centralizes the GPU memory cleanup logic into a single static method to prevent flaky test failures from OOM errors. The helper is named with a single leading underscore (_cleanup_gpu_memory) because a double-underscore name is mangled per class and would raise AttributeError when called from RemoteOpenAIServerCustom.__exit__. Signed-off-by: AzizCode92 --- tests/utils.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index b061caf6a4489..38bdb4007d2ba 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -170,14 +170,18 @@ class RemoteOpenAIServer: except subprocess.TimeoutExpired: # force kill if needed self.proc.kill() - # GPU memory cleanup + self._cleanup_gpu_memory() + + @staticmethod + def _cleanup_gpu_memory(): try: - if current_platform.is_cuda() or current_platform.is_rocm(): + if current_platform.is_cuda_alike(): num_devices = cuda_device_count_stateless() if num_devices > 0: wait_for_gpu_memory_to_clear(devices=list( range(num_devices)), - threshold_ratio=0.05) + threshold_ratio=0.05, + timeout_s=60) except Exception as e: print(f"GPU cleanup warning: {e}") @@ -276,16 +280,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): # force kill if needed self.proc.kill() - # GPU memory cleaning - try: - if current_platform.is_cuda() or current_platform.is_rocm(): - num_devices = cuda_device_count_stateless() - if num_devices > 0: - wait_for_gpu_memory_to_clear(devices=list( -
range(num_devices)), - threshold_ratio=0.05) - except Exception as e: - print(f"GPU cleanup warning: {e}") + self._cleanup_gpu_memory() def _test_completion(