[BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
This commit is contained in:
parent 21f5d50fa5
commit 6c4dbe23eb
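The change replaces an unconditional `del os.environ[...]` with `os.environ.pop(..., None)`, so the code no longer fails when the variable was never set. A minimal, standalone sketch of the difference (illustrative only, not taken from the diff itself):

    import os

    # pop() with a default is a no-op when the variable is absent
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)

    # del assumes the key exists and raises KeyError otherwise
    try:
        del os.environ["CUDA_VISIBLE_DEVICES"]
    except KeyError:
        print("CUDA_VISIBLE_DEVICES was not set; del would have crashed the worker")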
@@ -92,7 +92,7 @@ class MyLLM(LLM):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         super().__init__(*args, **kwargs)

@@ -59,7 +59,7 @@ class MyLLM(LLM):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         # every worker will use 0.4 GPU, so that we can schedule
         # 2 instances on the same GPUs.
         os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"

@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
                                       distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                           distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):

 @ray.remote(num_gpus=1, max_calls=1)
 def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):

 @ray.remote(num_gpus=1, max_calls=1)
 def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
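All of the test hunks above share the same Ray worker pattern: clear CUDA_VISIBLE_DEVICES (which Ray sets per task) so the worker can see every GPU, then bind to the GPU matching its rank. A condensed, hypothetical sketch of that pattern (the worker name and return value are illustrative, not part of the diff):

    import os

    import ray
    import torch


    @ray.remote(num_gpus=1, max_calls=1)
    def example_worker(rank: int) -> str:
        # Remove Ray's per-task GPU masking; pop() is safe even if it is unset.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        # Bind this worker to the GPU that corresponds to its rank.
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        return str(device)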