From 6c4dbe23eb85e5d1da00ccaf4923a275d8769a7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?=
 =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?=
Date: Tue, 11 Feb 2025 18:21:50 +0200
Subject: [PATCH] [BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962)

Signed-off-by: Hollow Man
---
 examples/offline_inference/rlhf.py          |  2 +-
 examples/offline_inference/rlhf_colocate.py |  2 +-
 tests/distributed/test_comm_ops.py          | 10 +++++-----
 tests/distributed/test_custom_all_reduce.py |  4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index 5000251c099f7..172d18cbce2f9 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -92,7 +92,7 @@ class MyLLM(LLM):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         super().__init__(*args, **kwargs)

diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index b921bc71feb99..15dc7edc18ad9 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -59,7 +59,7 @@ class MyLLM(LLM):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         # every worker will use 0.4 GPU, so that we can schedule
         # 2 instances on the same GPUs.
         os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"

diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index bc916e8de07c4..7b0346b8ab50f 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
                                       distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                           distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 46887bca42a90..4928690bebb07 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
 @ray.remote(num_gpus=1, max_calls=1)
 def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
 @ray.remote(num_gpus=1, max_calls=1)
 def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
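
Note (reviewer sketch, not part of the patch): the rationale for the change is that `del os.environ[...]` raises KeyError when the variable is not set, while `os.environ.pop(..., None)` removes it when present and is a no-op otherwise. A minimal standalone illustration, using only the Python standard library:

    import os

    # Ensure the variable exists, then delete it once.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    del os.environ["CUDA_VISIBLE_DEVICES"]

    # A second `del` fails because the variable is now absent;
    # this is the failure mode the patch avoids.
    try:
        del os.environ["CUDA_VISIBLE_DEVICES"]
    except KeyError:
        print("del raised KeyError: CUDA_VISIBLE_DEVICES was not set")

    # `pop` with a default is safe whether or not the variable exists.
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)  # no-op here, no exception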