From 3de6e6a30e33406c2b3bb81fb5a82a2966cebd87 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 3 Jul 2024 16:40:31 -0700 Subject: [PATCH] [core][distributed] support n layers % pp size != 0 (#6115) --- .buildkite/test-pipeline.yaml | 1 + vllm/config.py | 15 ++++++--------- vllm/distributed/utils.py | 9 ++++++++- vllm/worker/openvino_worker.py | 1 + vllm/worker/tpu_worker.py | 1 + vllm/worker/worker.py | 1 + vllm/worker/xpu_worker.py | 1 + 7 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d127278aaae2d..3680bfdde8187 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -80,6 +80,7 @@ steps: commands: - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py + - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py diff --git a/vllm/config.py b/vllm/config.py index 0004622ced60d..1ea2888796808 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -265,8 +265,6 @@ class ModelConfig: " must be divisible by tensor parallel size " f"({tensor_parallel_size}).") - total_num_hidden_layers = getattr(self.hf_text_config, - "num_hidden_layers", 0) pipeline_parallel_size = parallel_config.pipeline_parallel_size architectures = getattr(self.hf_config, "architectures", []) if not all(arch in _PP_SUPPORTED_MODELS @@ -275,12 +273,6 @@ class ModelConfig: "Pipeline parallelism is only supported for the following " f" architectures: {_PP_SUPPORTED_MODELS}.") - if total_num_hidden_layers % pipeline_parallel_size != 0: - raise ValueError( - f"Total number of hidden layers ({total_num_hidden_layers}) " - "must be divisible by pipeline parallel size " - f"({pipeline_parallel_size}).") - if self.quantization == "bitsandbytes" and ( parallel_config.tensor_parallel_size > 1 or parallel_config.pipeline_parallel_size > 1): @@ -385,9 +377,13 @@ class ModelConfig: return num_heads // parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> int: + from vllm.distributed.utils import get_pp_indices total_num_hidden_layers = getattr(self.hf_text_config, "num_hidden_layers", 0) - return total_num_hidden_layers // parallel_config.pipeline_parallel_size + pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size + pp_size = parallel_config.pipeline_parallel_size + start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) + return end - start def contains_seqlen_agnostic_layers( self, parallel_config: "ParallelConfig") -> bool: @@ -709,6 +705,7 @@ class ParallelConfig: {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES}) self._verify_args() + self.rank = 0 def _verify_args(self) -> None: if (self.pipeline_parallel_size > 1 diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 4e4206e5893aa..b5cf6c45f478f 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -50,8 +50,15 @@ def split_tensor_along_last_dim( def get_pp_indices(num_hidden_layers: int, pp_rank: int, pp_size: int) -> Tuple[int, int]: - layers_per_partition = divide(num_hidden_layers, pp_size) + """Try to evenly distribute layers across partitions. 
+ If the number of layers is not divisible by the number of partitions, + the last partition will have the remaining layers. + """ + layers_per_partition = num_hidden_layers // pp_size start_layer = pp_rank * layers_per_partition end_layer = start_layer + layers_per_partition + if pp_rank == pp_size - 1: + end_layer = num_hidden_layers + return (start_layer, end_layer) diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 8ac6f1704653c..c47f9acc4423d 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -154,6 +154,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): ) -> None: self.model_config = model_config self.parallel_config = parallel_config + self.parallel_config.rank = rank self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 30725473a28b6..60fee9892fba8 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -39,6 +39,7 @@ class TPUWorker(LoraNotSupportedWorkerBase): ) -> None: self.model_config = model_config self.parallel_config = parallel_config + self.parallel_config.rank = rank self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 26a176be40a7e..58707269bd68c 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -50,6 +50,7 @@ class Worker(LocalOrDistributedWorkerBase): ) -> None: self.model_config = model_config self.parallel_config = parallel_config + self.parallel_config.rank = rank self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index a946eb624a081..94dfcfec37757 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -54,6 +54,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): self.model_config = model_config self.parallel_config = parallel_config + self.parallel_config.rank = rank self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config
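Note (illustrative sketch, not part of the patch): the snippet below replays the new partitioning rule end to end. get_pp_indices is copied from the patched vllm/distributed/utils.py; layers_for_rank, the 26-layer count, and the standalone demo are assumptions added here to mirror ModelConfig.get_num_layers, which derives the PP rank as parallel_config.rank // tensor_parallel_size from the rank each worker now stores on its ParallelConfig.

    from typing import Tuple


    def get_pp_indices(num_hidden_layers: int, pp_rank: int,
                       pp_size: int) -> Tuple[int, int]:
        """Try to evenly distribute layers across partitions.
        If the number of layers is not divisible by the number of partitions,
        the last partition will have the remaining layers.
        """
        # Copied from the patched vllm/distributed/utils.py above.
        layers_per_partition = num_hidden_layers // pp_size
        start_layer = pp_rank * layers_per_partition
        end_layer = start_layer + layers_per_partition

        if pp_rank == pp_size - 1:
            end_layer = num_hidden_layers

        return (start_layer, end_layer)


    def layers_for_rank(total_layers: int, rank: int, tp_size: int,
                        pp_size: int) -> int:
        # Hypothetical helper mirroring ModelConfig.get_num_layers: the PP rank
        # is the global rank divided by the TP size, and the per-rank layer
        # count is the length of that rank's (start, end) slice.
        pp_rank = rank // tp_size
        start, end = get_pp_indices(total_layers, pp_rank, pp_size)
        return end - start


    if __name__ == "__main__":
        # TP_SIZE=1, PP_SIZE=3 as in the new CI entry; 26 layers is an
        # arbitrary layer count that is not divisible by the PP size.
        tp_size, pp_size, num_layers = 1, 3, 26
        for rank in range(tp_size * pp_size):
            pp_rank = rank // tp_size
            start, end = get_pp_indices(num_layers, pp_rank, pp_size)
            count = layers_for_rank(num_layers, rank, tp_size, pp_size)
            print(f"rank={rank} pp_rank={pp_rank} "
                  f"layers=[{start}, {end}) count={count}")
        # Expected output:
        # rank=0 pp_rank=0 layers=[0, 8) count=8
        # rank=1 pp_rank=1 layers=[8, 16) count=8
        # rank=2 pp_rank=2 layers=[16, 26) count=10

With TP_SIZE=1 and PP_SIZE=3 (the configuration added to the CI pipeline above), a 26-layer model splits as [0, 8), [8, 16), [16, 26): every PP rank gets floor(26 / 3) = 8 layers and the last rank absorbs the remaining 2, which is why the old divisibility check in ModelConfig could be removed.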