Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-27 02:21:48 +08:00)
[Core][Test] move local_rank to the last arg with default value to keep api compatible (#3711)
parent 395aa823ea
commit 756b30a5f3
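For context, a hypothetical sketch (not vLLM code) of the compatibility pattern this commit applies: a parameter that had been inserted in the middle of a positional signature is moved to the end and given a default, so callers written against the original signature keep working.

def init_env_old(world_size: int, local_rank: int, rank: int,
                 init_method: str) -> None:   # local_rank in the middle
    ...


def init_env_new(world_size: int, rank: int, init_method: str,
                 local_rank: int = -1) -> None:  # moved last, with a default
    ...


# A caller written for the original (world_size, rank, init_method) order
# mis-binds its arguments against init_env_old and fails with a TypeError
# for the missing init_method ...
# init_env_old(2, 0, "tcp://localhost:29500")
# ... but works unchanged against init_env_new, and local_rank can still be
# supplied by keyword when it differs from the global rank.
init_env_new(2, 0, "tcp://localhost:29500")
init_env_new(2, 0, "tcp://localhost:29500", local_rank=1)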
@@ -24,7 +24,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
     del os.environ["CUDA_VISIBLE_DEVICES"]
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(1, tensor_parallel_size, rank, rank,
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                       distributed_init_port)
     num_elements = 8
     all_tensors = [

@@ -46,7 +46,7 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
     del os.environ["CUDA_VISIBLE_DEVICES"]
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(1, tensor_parallel_size, rank, rank,
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                       distributed_init_port)
     num_dimensions = 3
     tensor_size = list(range(2, num_dimensions + 2))

@@ -74,7 +74,7 @@ def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
     del os.environ["CUDA_VISIBLE_DEVICES"]
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(1, tensor_parallel_size, rank, rank,
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                       distributed_init_port)
     test_dict = {
         "a": torch.arange(8, dtype=torch.float32, device="cuda"),
@@ -23,7 +23,7 @@ def graph_allreduce(world_size, rank, distributed_init_port):
     del os.environ["CUDA_VISIBLE_DEVICES"]
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(1, world_size, rank, rank,
+    init_test_distributed_environment(1, world_size, rank,
                                       distributed_init_port)

     custom_ar.init_custom_ar()

@@ -58,7 +58,7 @@ def eager_allreduce(world_size, rank, distributed_init_port):
     del os.environ["CUDA_VISIBLE_DEVICES"]
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
-    init_test_distributed_environment(1, world_size, rank, rank,
+    init_test_distributed_environment(1, world_size, rank,
                                       distributed_init_port)

     sz = 1024
@@ -14,7 +14,9 @@ def distributed_run(fn, world_size):
     for i in range(number_of_processes):
         env = os.environ.copy()
         env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
         env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
         env['MASTER_ADDR'] = 'localhost'
         env['MASTER_PORT'] = '12345'
         p = multiprocessing.Process(target=fn, args=(env, ))
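The two LOCAL_* variables added above follow the torchrun/elastic naming conventions. As a rough, illustrative sketch (the worker name and the gloo backend are assumptions, not the test's actual code), a spawned worker could consume the prepared environment like this:

import os

import torch
import torch.distributed as dist


def example_worker(env: dict) -> None:
    # MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are what the "env://"
    # init method of torch.distributed reads; LOCAL_RANK is conventionally
    # used by the process itself to select its GPU.
    os.environ.update(env)
    local_rank = int(os.environ["LOCAL_RANK"])
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="gloo", init_method="env://")
    print(f"rank {dist.get_rank()} of {dist.get_world_size()}, "
          f"local_rank {local_rank}")
    dist.destroy_process_group()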
@@ -202,11 +202,11 @@ class NCCLCommunicator:
         init_method=None,
         timeout=datetime.timedelta(seconds=10),
         world_size: int = -1,
-        local_rank: int = -1,
         rank: int = -1,
         store=None,
         group_name: str = "",
         pg_options=None,
+        local_rank: int = -1,
     ):
         if not dist.is_initialized():
             backend = backend or "nccl"

@@ -220,6 +220,11 @@ class NCCLCommunicator:
                 store=store,
                 group_name=group_name,
                 pg_options=pg_options)
+        self.rank = dist.get_rank()
+        self.world_size = dist.get_world_size()
+        if local_rank == -1:
+            local_rank = self.rank
+        self.local_rank = local_rank
         torch.cuda.set_device(local_rank)
         if rank == 0:
             self.unique_id = ncclGetUniqueId()
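The new fallback treats -1 as "local_rank not provided" and reuses the global rank, which is only a safe default on a single node. A small illustration of the distinction follows; the gpus_per_node arithmetic is just an example, not logic from this commit.

def default_local_rank(rank: int, local_rank: int = -1) -> int:
    # Same fallback as the constructor above: -1 means "not provided",
    # so reuse the global rank.
    return rank if local_rank == -1 else local_rank


# Single node with 4 GPUs: ranks 0..3 map straight to cuda:0..cuda:3.
assert [default_local_rank(r) for r in range(4)] == [0, 1, 2, 3]

# Two nodes with 4 GPUs each: global rank 5 is GPU 1 on the second node,
# so the caller must pass local_rank explicitly (e.g. rank % gpus_per_node);
# the fallback alone would point at a nonexistent cuda:5.
gpus_per_node = 4
assert default_local_rank(5, local_rank=5 % gpus_per_node) == 1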
@@ -35,8 +35,10 @@ def set_pynccl_stream(stream: torch.cuda.Stream):
     pass


-def init_process_group(world_size: int, local_rank: int, rank: int,
-                       init_method: str) -> None:
+def init_process_group(world_size: int,
+                       rank: int,
+                       init_method: str,
+                       local_rank: int = -1) -> None:
     assert not is_initialized()
     global comm
     logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
@@ -8,9 +8,9 @@ from vllm.worker.worker import init_distributed_environment
 def init_test_distributed_environment(
     pipeline_parallel_size: int,
     tensor_parallel_size: int,
-    local_rank: int,
     rank: int,
     distributed_init_port: str,
+    local_rank: int = -1,
 ) -> None:
     parallel_config = ParallelConfig(pipeline_parallel_size,
                                      tensor_parallel_size,

@@ -18,9 +18,9 @@ def init_test_distributed_environment(
     distributed_init_method = f"tcp://localhost:{distributed_init_port}"
     init_distributed_environment(
         parallel_config,
-        local_rank,
         rank,
-        distributed_init_method=distributed_init_method)
+        distributed_init_method=distributed_init_method,
+        local_rank=local_rank)


 def multi_process_tensor_parallel(
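Note that the test helper above forwards the new parameter by keyword (local_rank=local_rank) rather than positionally, so the -1 "unset" sentinel keeps its meaning across the wrapper. A minimal sketch of that forwarding pattern, with illustrative names rather than the actual vLLM helpers:

def init_environment(rank: int, init_method: str, local_rank: int = -1) -> None:
    effective = rank if local_rank == -1 else local_rank
    print(f"rank={rank} local_rank={effective} init_method={init_method}")


def init_test_environment(rank: int, port: str, local_rank: int = -1) -> None:
    # Expose the same trailing default and pass it through by keyword,
    # so callers of the wrapper get the same compatibility guarantee.
    init_environment(rank, f"tcp://localhost:{port}", local_rank=local_rank)


init_test_environment(0, "29500")                # sentinel propagates
init_test_environment(5, "29500", local_rank=1)  # explicit local rank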
@@ -97,8 +97,9 @@ class Worker:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
         # Initialize the distributed environment.
-        init_distributed_environment(self.parallel_config, self.local_rank,
-                                     self.rank, self.distributed_init_method)
+        init_distributed_environment(self.parallel_config, self.rank,
+                                     self.distributed_init_method,
+                                     self.local_rank)
         # Set random seed.
         set_random_seed(self.model_config.seed)


@@ -249,9 +250,9 @@ class Worker:

 def init_distributed_environment(
     parallel_config: ParallelConfig,
-    local_rank: int,
     rank: int,
     distributed_init_method: Optional[str] = None,
+    local_rank: int = -1,
 ) -> None:
     """Initialize the distributed environment."""
     if torch.distributed.is_initialized():