mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-06 04:22:13 +08:00
Move ParallelConfig from config/__init__.py to config/parallel.py (#22565)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
56186474f6
commit
2d18256e47
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
|||||||
|
|
||||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||||
# so spam a lot of people
|
# so spam a lot of people
|
||||||
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
|
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
|
||||||
|
|
||||||
# vLLM V1
|
# vLLM V1
|
||||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||||
|
|||||||
@ -25,13 +25,13 @@ from pydantic import (ConfigDict, SkipValidation, field_validator,
|
|||||||
model_validator)
|
model_validator)
|
||||||
from pydantic.dataclasses import dataclass
|
from pydantic.dataclasses import dataclass
|
||||||
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
|
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
|
||||||
from torch.distributed import ProcessGroup, ReduceOp
|
|
||||||
from typing_extensions import Self, assert_never, runtime_checkable
|
from typing_extensions import Self, assert_never, runtime_checkable
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm import version
|
from vllm import version
|
||||||
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
|
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
|
||||||
PassConfig)
|
PassConfig)
|
||||||
|
from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig
|
||||||
from vllm.config.utils import ConfigType, config
|
from vllm.config.utils import ConfigType, config
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
@ -50,20 +50,16 @@ from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
|
|||||||
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
|
||||||
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
|
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
|
||||||
LayerBlockType, LazyLoader, common_broadcastable_dtype,
|
LayerBlockType, LazyLoader, common_broadcastable_dtype,
|
||||||
cuda_device_count_stateless, get_cpu_memory,
|
get_cpu_memory, random_uuid)
|
||||||
get_open_port, random_uuid)
|
|
||||||
|
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from _typeshed import DataclassInstance
|
from _typeshed import DataclassInstance
|
||||||
from ray.runtime_env import RuntimeEnv
|
|
||||||
from ray.util.placement_group import PlacementGroup
|
|
||||||
from transformers.configuration_utils import PretrainedConfig
|
from transformers.configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
import vllm.model_executor.layers.quantization as me_quant
|
import vllm.model_executor.layers.quantization as me_quant
|
||||||
import vllm.model_executor.models as me_models
|
import vllm.model_executor.models as me_models
|
||||||
from vllm.executor.executor_base import ExecutorBase
|
|
||||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
from vllm.model_executor.layers.quantization.base_config import (
|
from vllm.model_executor.layers.quantization.base_config import (
|
||||||
QuantizationConfig)
|
QuantizationConfig)
|
||||||
@ -73,10 +69,7 @@ if TYPE_CHECKING:
|
|||||||
HfOverrides = Union[dict, Callable[[type], type]]
|
HfOverrides = Union[dict, Callable[[type], type]]
|
||||||
else:
|
else:
|
||||||
DataclassInstance = Any
|
DataclassInstance = Any
|
||||||
PlacementGroup = Any
|
|
||||||
RuntimeEnv = Any
|
|
||||||
PretrainedConfig = Any
|
PretrainedConfig = Any
|
||||||
ExecutorBase = Any
|
|
||||||
QuantizationConfig = Any
|
QuantizationConfig = Any
|
||||||
QuantizationMethods = Any
|
QuantizationMethods = Any
|
||||||
BaseModelLoader = Any
|
BaseModelLoader = Any
|
||||||
@ -2043,352 +2036,6 @@ class LoadConfig:
|
|||||||
self.ignore_patterns = ["original/**/*"]
|
self.ignore_patterns = ["original/**/*"]
|
||||||
|
|
||||||
|
|
||||||
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
|
|
||||||
|
|
||||||
|
|
||||||
@config
|
|
||||||
@dataclass
|
|
||||||
class ParallelConfig:
|
|
||||||
"""Configuration for the distributed execution."""
|
|
||||||
|
|
||||||
pipeline_parallel_size: int = 1
|
|
||||||
"""Number of pipeline parallel groups."""
|
|
||||||
tensor_parallel_size: int = 1
|
|
||||||
"""Number of tensor parallel groups."""
|
|
||||||
data_parallel_size: int = 1
|
|
||||||
"""Number of data parallel groups. MoE layers will be sharded according to
|
|
||||||
the product of the tensor parallel size and data parallel size."""
|
|
||||||
data_parallel_size_local: int = 1
|
|
||||||
"""Number of local data parallel groups."""
|
|
||||||
data_parallel_rank: int = 0
|
|
||||||
"""Rank of the data parallel group."""
|
|
||||||
data_parallel_rank_local: Optional[int] = None
|
|
||||||
"""Local rank of the data parallel group,
|
|
||||||
set only in SPMD mode."""
|
|
||||||
data_parallel_master_ip: str = "127.0.0.1"
|
|
||||||
"""IP of the data parallel master."""
|
|
||||||
data_parallel_rpc_port: int = 29550
|
|
||||||
"""Port for data parallel messaging."""
|
|
||||||
data_parallel_master_port: int = 29500
|
|
||||||
"""Port of the data parallel master."""
|
|
||||||
data_parallel_backend: str = "mp"
|
|
||||||
"""Backend to use for data parallel, either "mp" or "ray"."""
|
|
||||||
data_parallel_external_lb: bool = False
|
|
||||||
"""Whether to use "external" DP LB mode. Applies only to online serving
|
|
||||||
and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
|
|
||||||
wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank
|
|
||||||
is provided explicitly to vllm serve."""
|
|
||||||
data_parallel_hybrid_lb: bool = False
|
|
||||||
"""Whether to use "hybrid" DP LB mode. Applies only to online serving
|
|
||||||
and when data_parallel_size > 0. Enables running an AsyncLLM
|
|
||||||
and API server on a "per-node" basis where vLLM load balances
|
|
||||||
between local data parallel ranks, but an external LB balances
|
|
||||||
between vLLM nodes/replicas. Set explicitly in conjunction with
|
|
||||||
--data-parallel-start-rank."""
|
|
||||||
enable_expert_parallel: bool = False
|
|
||||||
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
|
|
||||||
enable_eplb: bool = False
|
|
||||||
"""Enable expert parallelism load balancing for MoE layers."""
|
|
||||||
num_redundant_experts: int = 0
|
|
||||||
"""Number of redundant experts to use for expert parallelism."""
|
|
||||||
eplb_window_size: int = 1000
|
|
||||||
"""Window size for expert load recording."""
|
|
||||||
eplb_step_interval: int = 3000
|
|
||||||
"""
|
|
||||||
Interval for rearranging experts in expert parallelism.
|
|
||||||
|
|
||||||
Note that if this is greater than the EPLB window size, only the metrics
|
|
||||||
of the last `eplb_window_size` steps will be used for rearranging experts.
|
|
||||||
"""
|
|
||||||
eplb_log_balancedness: bool = False
|
|
||||||
"""
|
|
||||||
Log the balancedness each step of expert parallelism.
|
|
||||||
This is turned off by default since it will cause communication overhead.
|
|
||||||
"""
|
|
||||||
|
|
||||||
max_parallel_loading_workers: Optional[int] = None
|
|
||||||
"""Maximum number of parallel loading workers when loading model
|
|
||||||
sequentially in multiple batches. To avoid RAM OOM when using tensor
|
|
||||||
parallel and large models."""
|
|
||||||
|
|
||||||
disable_custom_all_reduce: bool = False
|
|
||||||
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
|
||||||
|
|
||||||
ray_workers_use_nsight: bool = False
|
|
||||||
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
|
|
||||||
|
|
||||||
ray_runtime_env: Optional["RuntimeEnv"] = None
|
|
||||||
"""Ray runtime environment to pass to distributed workers."""
|
|
||||||
|
|
||||||
placement_group: Optional["PlacementGroup"] = None
|
|
||||||
"""ray distributed model workers placement group."""
|
|
||||||
|
|
||||||
distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
|
|
||||||
type["ExecutorBase"]]] = None
|
|
||||||
"""Backend to use for distributed model
|
|
||||||
workers, either "ray" or "mp" (multiprocessing). If the product
|
|
||||||
of pipeline_parallel_size and tensor_parallel_size is less than
|
|
||||||
or equal to the number of GPUs available, "mp" will be used to
|
|
||||||
keep processing on a single host. Otherwise, this will default
|
|
||||||
to "ray" if Ray is installed and fail otherwise. Note that tpu
|
|
||||||
only support Ray for distributed inference."""
|
|
||||||
|
|
||||||
worker_cls: str = "auto"
|
|
||||||
"""The full name of the worker class to use. If "auto", the worker class
|
|
||||||
will be determined based on the platform."""
|
|
||||||
sd_worker_cls: str = "auto"
|
|
||||||
"""The full name of the worker class to use for speculative decoding.
|
|
||||||
If "auto", the worker class will be determined based on the platform."""
|
|
||||||
worker_extension_cls: str = ""
|
|
||||||
"""The full name of the worker extension class to use. The worker extension
|
|
||||||
class is dynamically inherited by the worker class. This is used to inject
|
|
||||||
new attributes and methods to the worker class for use in collective_rpc
|
|
||||||
calls."""
|
|
||||||
|
|
||||||
world_size: int = field(init=False)
|
|
||||||
"""world_size is TPxPP, it affects the number of workers we create."""
|
|
||||||
|
|
||||||
rank: int = 0
|
|
||||||
"""Global rank in distributed setup."""
|
|
||||||
|
|
||||||
enable_multimodal_encoder_data_parallel: bool = False
|
|
||||||
""" Use data parallelism instead of tensor parallelism for vision encoder.
|
|
||||||
Only support LLama4 for now"""
|
|
||||||
|
|
||||||
@property
|
|
||||||
def world_size_across_dp(self) -> int:
|
|
||||||
"""world_size_across_dp is TPxPPxDP, it is the size of the world
|
|
||||||
including data parallelism."""
|
|
||||||
return self.world_size * self.data_parallel_size
|
|
||||||
|
|
||||||
def get_next_dp_init_port(self) -> int:
|
|
||||||
"""
|
|
||||||
We might need to initialize process groups in multiple
|
|
||||||
processes that is related to data parallelism,
|
|
||||||
e.g. both in the worker and in the engine, which
|
|
||||||
can live in different processes. To avoid port conflicts, we
|
|
||||||
increment the port number each time we need to initialize a
|
|
||||||
new process group related to data parallelism.
|
|
||||||
"""
|
|
||||||
answer = self.data_parallel_master_port
|
|
||||||
self.data_parallel_master_port += 1
|
|
||||||
return answer
|
|
||||||
|
|
||||||
def stateless_init_dp_group(self) -> "ProcessGroup":
|
|
||||||
# NOTE: In high-concurrency scenarios multiple processes
|
|
||||||
# can pick the same (currently free) port through a race
|
|
||||||
# condition when calling `get_open_port()`. When the first
|
|
||||||
# process binds the port the others will subsequently fail
|
|
||||||
# with `torch.distributed.DistNetworkError: EADDRINUSE`.
|
|
||||||
# To make the initialization more robust we retry a few times
|
|
||||||
# with a fresh port whenever this specific error is observed.
|
|
||||||
from torch.distributed import DistNetworkError
|
|
||||||
|
|
||||||
from vllm.distributed.utils import (
|
|
||||||
stateless_init_torch_distributed_process_group)
|
|
||||||
|
|
||||||
max_retries = 5
|
|
||||||
last_exc: Optional[Exception] = None
|
|
||||||
for _ in range(max_retries):
|
|
||||||
try:
|
|
||||||
# use gloo since the engine process might not have cuda device
|
|
||||||
return stateless_init_torch_distributed_process_group(
|
|
||||||
self.data_parallel_master_ip,
|
|
||||||
self.get_next_dp_init_port(),
|
|
||||||
self.data_parallel_rank,
|
|
||||||
self.data_parallel_size,
|
|
||||||
backend="gloo")
|
|
||||||
except DistNetworkError as e:
|
|
||||||
# We only want to retry when the root cause is EADDRINUSE.
|
|
||||||
if "EADDRINUSE" in str(e):
|
|
||||||
logger.warning(
|
|
||||||
"Address already in use. Retrying with a new port.")
|
|
||||||
last_exc = e
|
|
||||||
continue # try again with a new port
|
|
||||||
raise e
|
|
||||||
|
|
||||||
# If we get here all retries have failed.
|
|
||||||
assert last_exc is not None
|
|
||||||
raise last_exc
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def has_unfinished_dp(dp_group: "ProcessGroup",
|
|
||||||
has_unfinished: bool) -> bool:
|
|
||||||
tensor = torch.tensor([has_unfinished],
|
|
||||||
dtype=torch.int32,
|
|
||||||
device="cpu")
|
|
||||||
# dp rank 0: has_unfinished_seqs=True
|
|
||||||
# dp rank 1: has_unfinished_seqs=False
|
|
||||||
# aggregated: has_unfinished_seqs=True
|
|
||||||
# so this is an OR operation, i.e. MAX in integers
|
|
||||||
torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group)
|
|
||||||
aggregated_has_unfinished = bool(tensor.item())
|
|
||||||
return aggregated_has_unfinished
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def sync_kv_cache_memory_size(dp_group: "ProcessGroup",
|
|
||||||
kv_cache_memory: int) -> int:
|
|
||||||
if kv_cache_memory == -1:
|
|
||||||
kv_cache_memory = torch.iinfo(torch.int64).max
|
|
||||||
tensor = torch.tensor([kv_cache_memory],
|
|
||||||
dtype=torch.int64,
|
|
||||||
device="cpu")
|
|
||||||
# we cannot use broadcast for stateless dp group since it depends
|
|
||||||
# on global rank
|
|
||||||
torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
|
|
||||||
return tensor.item()
|
|
||||||
|
|
||||||
def compute_hash(self):
|
|
||||||
"""
|
|
||||||
Provide a hash that uniquely identifies all the configs
|
|
||||||
that affect the structure of the computation
|
|
||||||
graph from input ids/embeddings to the final hidden states,
|
|
||||||
excluding anything before input ids/embeddings and after
|
|
||||||
the final hidden states.
|
|
||||||
"""
|
|
||||||
factors: list[Any] = []
|
|
||||||
factors.append(self.pipeline_parallel_size)
|
|
||||||
factors.append(self.tensor_parallel_size)
|
|
||||||
factors.append(self.enable_expert_parallel)
|
|
||||||
factors.append(self.data_parallel_size)
|
|
||||||
factors.append(envs.VLLM_ALL2ALL_BACKEND)
|
|
||||||
return hashlib.sha256(str(factors).encode()).hexdigest()
|
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
|
||||||
self.world_size = self.pipeline_parallel_size * \
|
|
||||||
self.tensor_parallel_size
|
|
||||||
|
|
||||||
if self.data_parallel_size_local > self.data_parallel_size:
|
|
||||||
raise ValueError(
|
|
||||||
f"data_parallel_size_local ({self.data_parallel_size_local}) "
|
|
||||||
f"must be <= data_parallel_size ({self.data_parallel_size})")
|
|
||||||
|
|
||||||
if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
|
|
||||||
# Data parallel was specified in the engine args.
|
|
||||||
self.data_parallel_master_port = get_open_port()
|
|
||||||
|
|
||||||
if not (0 <= self.data_parallel_rank < self.data_parallel_size):
|
|
||||||
raise ValueError(
|
|
||||||
f"data_parallel_rank ({self.data_parallel_rank})"
|
|
||||||
f" must be in the range [0, {self.data_parallel_size})")
|
|
||||||
else:
|
|
||||||
# Otherwise fall back to env vars (e.g. for offline SPMD case).
|
|
||||||
self.data_parallel_size = envs.VLLM_DP_SIZE
|
|
||||||
self.data_parallel_rank = envs.VLLM_DP_RANK
|
|
||||||
self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
|
|
||||||
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
|
|
||||||
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
|
|
||||||
|
|
||||||
if self.data_parallel_external_lb:
|
|
||||||
raise ValueError("data_parallel_external_lb can only "
|
|
||||||
"be set when data_parallel_size > 1")
|
|
||||||
|
|
||||||
if self.distributed_executor_backend == "external_launcher":
|
|
||||||
import os
|
|
||||||
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
|
|
||||||
logger.info("Disabling V1 multiprocessing for external launcher.")
|
|
||||||
|
|
||||||
if self.enable_eplb:
|
|
||||||
if not current_platform.is_cuda():
|
|
||||||
raise ValueError(
|
|
||||||
"Expert parallelism load balancing is only supported on "
|
|
||||||
"CUDA devices now.")
|
|
||||||
if self.num_redundant_experts < 0:
|
|
||||||
raise ValueError(
|
|
||||||
"num_redundant_experts must be non-negative, but got "
|
|
||||||
f"{self.num_redundant_experts}.")
|
|
||||||
if not self.enable_expert_parallel:
|
|
||||||
raise ValueError(
|
|
||||||
"enable_expert_parallel must be True to use EPLB.")
|
|
||||||
if self.tensor_parallel_size * self.data_parallel_size <= 1:
|
|
||||||
raise ValueError(
|
|
||||||
"EPLB requires tensor_parallel_size or data_parallel_size "
|
|
||||||
f"to be greater than 1, but got "
|
|
||||||
f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
if self.num_redundant_experts != 0:
|
|
||||||
raise ValueError(
|
|
||||||
"num_redundant_experts should be used with EPLB."
|
|
||||||
f"{self.num_redundant_experts}.")
|
|
||||||
if self.distributed_executor_backend is None and self.world_size > 1:
|
|
||||||
# We use multiprocessing by default if world_size fits on the
|
|
||||||
# current node and we aren't in a ray placement group.
|
|
||||||
|
|
||||||
from vllm.executor import ray_utils
|
|
||||||
backend: DistributedExecutorBackend = "mp"
|
|
||||||
ray_found = ray_utils.ray_is_available()
|
|
||||||
if current_platform.is_neuron():
|
|
||||||
# neuron uses single process to control multiple devices
|
|
||||||
backend = "uni"
|
|
||||||
elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
|
|
||||||
backend = "uni"
|
|
||||||
elif (current_platform.is_cuda()
|
|
||||||
and cuda_device_count_stateless() < self.world_size):
|
|
||||||
if not ray_found:
|
|
||||||
raise ValueError("Unable to load Ray: "
|
|
||||||
f"{ray_utils.ray_import_err}. Ray is "
|
|
||||||
"required for multi-node inference, "
|
|
||||||
"please install Ray with `pip install "
|
|
||||||
"ray`.")
|
|
||||||
backend = "ray"
|
|
||||||
elif self.data_parallel_backend == "ray":
|
|
||||||
logger.info("Using ray distributed inference because "
|
|
||||||
"data_parallel_backend is ray")
|
|
||||||
backend = "ray"
|
|
||||||
elif ray_found:
|
|
||||||
if self.placement_group:
|
|
||||||
backend = "ray"
|
|
||||||
else:
|
|
||||||
from ray import is_initialized as ray_is_initialized
|
|
||||||
if ray_is_initialized():
|
|
||||||
from ray.util import get_current_placement_group
|
|
||||||
if get_current_placement_group():
|
|
||||||
backend = "ray"
|
|
||||||
self.distributed_executor_backend = backend
|
|
||||||
logger.debug("Defaulting to use %s for distributed inference",
|
|
||||||
backend)
|
|
||||||
|
|
||||||
if self.distributed_executor_backend is None and self.world_size == 1:
|
|
||||||
self.distributed_executor_backend = "uni"
|
|
||||||
|
|
||||||
@property
|
|
||||||
def use_ray(self) -> bool:
|
|
||||||
return self.distributed_executor_backend == "ray" or (
|
|
||||||
isinstance(self.distributed_executor_backend, type)
|
|
||||||
and self.distributed_executor_backend.uses_ray)
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
|
||||||
def _verify_args(self) -> Self:
|
|
||||||
# Lazy import to avoid circular import
|
|
||||||
from vllm.executor.executor_base import ExecutorBase
|
|
||||||
from vllm.platforms import current_platform
|
|
||||||
if self.distributed_executor_backend not in (
|
|
||||||
"ray", "mp", "uni",
|
|
||||||
"external_launcher", None) and not (isinstance(
|
|
||||||
self.distributed_executor_backend, type) and issubclass(
|
|
||||||
self.distributed_executor_backend, ExecutorBase)):
|
|
||||||
raise ValueError(
|
|
||||||
"Unrecognized distributed executor backend "
|
|
||||||
f"{self.distributed_executor_backend}. Supported "
|
|
||||||
"values are 'ray', 'mp' 'uni', 'external_launcher' or"
|
|
||||||
" custom ExecutorBase subclass.")
|
|
||||||
if self.use_ray:
|
|
||||||
from vllm.executor import ray_utils
|
|
||||||
ray_utils.assert_ray_available()
|
|
||||||
|
|
||||||
if not current_platform.use_custom_allreduce():
|
|
||||||
self.disable_custom_all_reduce = True
|
|
||||||
logger.debug(
|
|
||||||
"Disabled the custom all-reduce kernel because it is not "
|
|
||||||
"supported on current platform.")
|
|
||||||
if self.ray_workers_use_nsight and not self.use_ray:
|
|
||||||
raise ValueError("Unable to use nsight profiling unless workers "
|
|
||||||
"run with Ray.")
|
|
||||||
|
|
||||||
return self
|
|
||||||
|
|
||||||
|
|
||||||
PreemptionMode = Literal["swap", "recompute"]
|
PreemptionMode = Literal["swap", "recompute"]
|
||||||
SchedulerPolicy = Literal["fcfs", "priority"]
|
SchedulerPolicy = Literal["fcfs", "priority"]
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@ from vllm.logger import init_logger
|
|||||||
from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
|
from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
else:
|
else:
|
||||||
VllmConfig = object
|
VllmConfig = object
|
||||||
|
|
||||||
|
|||||||
375
vllm/config/parallel.py
Normal file
375
vllm/config/parallel.py
Normal file
@ -0,0 +1,375 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from dataclasses import field
|
||||||
|
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from pydantic import model_validator
|
||||||
|
from pydantic.dataclasses import dataclass
|
||||||
|
from torch.distributed import ProcessGroup, ReduceOp
|
||||||
|
from typing_extensions import Self
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
|
from vllm.config.utils import config
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils import cuda_device_count_stateless, get_open_port
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from ray.runtime_env import RuntimeEnv
|
||||||
|
from ray.util.placement_group import PlacementGroup
|
||||||
|
|
||||||
|
from vllm.executor.executor_base import ExecutorBase
|
||||||
|
else:
|
||||||
|
RuntimeEnv = Any
|
||||||
|
PlacementGroup = Any
|
||||||
|
ExecutorBase = Any
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
|
||||||
|
|
||||||
|
|
||||||
|
@config
|
||||||
|
@dataclass
|
||||||
|
class ParallelConfig:
|
||||||
|
"""Configuration for the distributed execution."""
|
||||||
|
|
||||||
|
pipeline_parallel_size: int = 1
|
||||||
|
"""Number of pipeline parallel groups."""
|
||||||
|
tensor_parallel_size: int = 1
|
||||||
|
"""Number of tensor parallel groups."""
|
||||||
|
data_parallel_size: int = 1
|
||||||
|
"""Number of data parallel groups. MoE layers will be sharded according to
|
||||||
|
the product of the tensor parallel size and data parallel size."""
|
||||||
|
data_parallel_size_local: int = 1
|
||||||
|
"""Number of local data parallel groups."""
|
||||||
|
data_parallel_rank: int = 0
|
||||||
|
"""Rank of the data parallel group."""
|
||||||
|
data_parallel_rank_local: Optional[int] = None
|
||||||
|
"""Local rank of the data parallel group,
|
||||||
|
set only in SPMD mode."""
|
||||||
|
data_parallel_master_ip: str = "127.0.0.1"
|
||||||
|
"""IP of the data parallel master."""
|
||||||
|
data_parallel_rpc_port: int = 29550
|
||||||
|
"""Port for data parallel messaging."""
|
||||||
|
data_parallel_master_port: int = 29500
|
||||||
|
"""Port of the data parallel master."""
|
||||||
|
data_parallel_backend: str = "mp"
|
||||||
|
"""Backend to use for data parallel, either "mp" or "ray"."""
|
||||||
|
data_parallel_external_lb: bool = False
|
||||||
|
"""Whether to use "external" DP LB mode. Applies only to online serving
|
||||||
|
and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
|
||||||
|
wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank
|
||||||
|
is provided explicitly to vllm serve."""
|
||||||
|
data_parallel_hybrid_lb: bool = False
|
||||||
|
"""Whether to use "hybrid" DP LB mode. Applies only to online serving
|
||||||
|
and when data_parallel_size > 0. Enables running an AsyncLLM
|
||||||
|
and API server on a "per-node" basis where vLLM load balances
|
||||||
|
between local data parallel ranks, but an external LB balances
|
||||||
|
between vLLM nodes/replicas. Set explicitly in conjunction with
|
||||||
|
--data-parallel-start-rank."""
|
||||||
|
enable_expert_parallel: bool = False
|
||||||
|
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
|
||||||
|
enable_eplb: bool = False
|
||||||
|
"""Enable expert parallelism load balancing for MoE layers."""
|
||||||
|
num_redundant_experts: int = 0
|
||||||
|
"""Number of redundant experts to use for expert parallelism."""
|
||||||
|
eplb_window_size: int = 1000
|
||||||
|
"""Window size for expert load recording."""
|
||||||
|
eplb_step_interval: int = 3000
|
||||||
|
"""
|
||||||
|
Interval for rearranging experts in expert parallelism.
|
||||||
|
|
||||||
|
Note that if this is greater than the EPLB window size, only the metrics
|
||||||
|
of the last `eplb_window_size` steps will be used for rearranging experts.
|
||||||
|
"""
|
||||||
|
eplb_log_balancedness: bool = False
|
||||||
|
"""
|
||||||
|
Log the balancedness each step of expert parallelism.
|
||||||
|
This is turned off by default since it will cause communication overhead.
|
||||||
|
"""
|
||||||
|
|
||||||
|
max_parallel_loading_workers: Optional[int] = None
|
||||||
|
"""Maximum number of parallel loading workers when loading model
|
||||||
|
sequentially in multiple batches. To avoid RAM OOM when using tensor
|
||||||
|
parallel and large models."""
|
||||||
|
|
||||||
|
disable_custom_all_reduce: bool = False
|
||||||
|
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
||||||
|
|
||||||
|
ray_workers_use_nsight: bool = False
|
||||||
|
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
|
||||||
|
|
||||||
|
ray_runtime_env: Optional[RuntimeEnv] = None
|
||||||
|
"""Ray runtime environment to pass to distributed workers."""
|
||||||
|
|
||||||
|
placement_group: Optional[PlacementGroup] = None
|
||||||
|
"""ray distributed model workers placement group."""
|
||||||
|
|
||||||
|
distributed_executor_backend: Optional[Union[DistributedExecutorBackend,
|
||||||
|
type[ExecutorBase]]] = None
|
||||||
|
"""Backend to use for distributed model
|
||||||
|
workers, either "ray" or "mp" (multiprocessing). If the product
|
||||||
|
of pipeline_parallel_size and tensor_parallel_size is less than
|
||||||
|
or equal to the number of GPUs available, "mp" will be used to
|
||||||
|
keep processing on a single host. Otherwise, this will default
|
||||||
|
to "ray" if Ray is installed and fail otherwise. Note that tpu
|
||||||
|
only support Ray for distributed inference."""
|
||||||
|
|
||||||
|
worker_cls: str = "auto"
|
||||||
|
"""The full name of the worker class to use. If "auto", the worker class
|
||||||
|
will be determined based on the platform."""
|
||||||
|
sd_worker_cls: str = "auto"
|
||||||
|
"""The full name of the worker class to use for speculative decoding.
|
||||||
|
If "auto", the worker class will be determined based on the platform."""
|
||||||
|
worker_extension_cls: str = ""
|
||||||
|
"""The full name of the worker extension class to use. The worker extension
|
||||||
|
class is dynamically inherited by the worker class. This is used to inject
|
||||||
|
new attributes and methods to the worker class for use in collective_rpc
|
||||||
|
calls."""
|
||||||
|
|
||||||
|
world_size: int = field(init=False)
|
||||||
|
"""world_size is TPxPP, it affects the number of workers we create."""
|
||||||
|
|
||||||
|
rank: int = 0
|
||||||
|
"""Global rank in distributed setup."""
|
||||||
|
|
||||||
|
enable_multimodal_encoder_data_parallel: bool = False
|
||||||
|
""" Use data parallelism instead of tensor parallelism for vision encoder.
|
||||||
|
Only support LLama4 for now"""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def world_size_across_dp(self) -> int:
|
||||||
|
"""world_size_across_dp is TPxPPxDP, it is the size of the world
|
||||||
|
including data parallelism."""
|
||||||
|
return self.world_size * self.data_parallel_size
|
||||||
|
|
||||||
|
def get_next_dp_init_port(self) -> int:
|
||||||
|
"""
|
||||||
|
We might need to initialize process groups in multiple
|
||||||
|
processes that is related to data parallelism,
|
||||||
|
e.g. both in the worker and in the engine, which
|
||||||
|
can live in different processes. To avoid port conflicts, we
|
||||||
|
increment the port number each time we need to initialize a
|
||||||
|
new process group related to data parallelism.
|
||||||
|
"""
|
||||||
|
answer = self.data_parallel_master_port
|
||||||
|
self.data_parallel_master_port += 1
|
||||||
|
return answer
|
||||||
|
|
||||||
|
def stateless_init_dp_group(self) -> ProcessGroup:
|
||||||
|
# NOTE: In high-concurrency scenarios multiple processes
|
||||||
|
# can pick the same (currently free) port through a race
|
||||||
|
# condition when calling `get_open_port()`. When the first
|
||||||
|
# process binds the port the others will subsequently fail
|
||||||
|
# with `torch.distributed.DistNetworkError: EADDRINUSE`.
|
||||||
|
# To make the initialization more robust we retry a few times
|
||||||
|
# with a fresh port whenever this specific error is observed.
|
||||||
|
from torch.distributed import DistNetworkError
|
||||||
|
|
||||||
|
from vllm.distributed.utils import (
|
||||||
|
stateless_init_torch_distributed_process_group)
|
||||||
|
|
||||||
|
max_retries = 5
|
||||||
|
last_exc: Optional[Exception] = None
|
||||||
|
for _ in range(max_retries):
|
||||||
|
try:
|
||||||
|
# use gloo since the engine process might not have cuda device
|
||||||
|
return stateless_init_torch_distributed_process_group(
|
||||||
|
self.data_parallel_master_ip,
|
||||||
|
self.get_next_dp_init_port(),
|
||||||
|
self.data_parallel_rank,
|
||||||
|
self.data_parallel_size,
|
||||||
|
backend="gloo")
|
||||||
|
except DistNetworkError as e:
|
||||||
|
# We only want to retry when the root cause is EADDRINUSE.
|
||||||
|
if "EADDRINUSE" in str(e):
|
||||||
|
logger.warning(
|
||||||
|
"Address already in use. Retrying with a new port.")
|
||||||
|
last_exc = e
|
||||||
|
continue # try again with a new port
|
||||||
|
raise e
|
||||||
|
|
||||||
|
# If we get here all retries have failed.
|
||||||
|
assert last_exc is not None
|
||||||
|
raise last_exc
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def has_unfinished_dp(dp_group: ProcessGroup,
|
||||||
|
has_unfinished: bool) -> bool:
|
||||||
|
tensor = torch.tensor([has_unfinished],
|
||||||
|
dtype=torch.int32,
|
||||||
|
device="cpu")
|
||||||
|
# dp rank 0: has_unfinished_seqs=True
|
||||||
|
# dp rank 1: has_unfinished_seqs=False
|
||||||
|
# aggregated: has_unfinished_seqs=True
|
||||||
|
# so this is an OR operation, i.e. MAX in integers
|
||||||
|
torch.distributed.all_reduce(tensor, op=ReduceOp.MAX, group=dp_group)
|
||||||
|
aggregated_has_unfinished = bool(tensor.item())
|
||||||
|
return aggregated_has_unfinished
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def sync_kv_cache_memory_size(dp_group: ProcessGroup,
|
||||||
|
kv_cache_memory: int) -> int:
|
||||||
|
if kv_cache_memory == -1:
|
||||||
|
kv_cache_memory = torch.iinfo(torch.int64).max
|
||||||
|
tensor = torch.tensor([kv_cache_memory],
|
||||||
|
dtype=torch.int64,
|
||||||
|
device="cpu")
|
||||||
|
# we cannot use broadcast for stateless dp group since it depends
|
||||||
|
# on global rank
|
||||||
|
torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
|
||||||
|
return tensor.item()
|
||||||
|
|
||||||
|
def compute_hash(self):
|
||||||
|
"""
|
||||||
|
Provide a hash that uniquely identifies all the configs
|
||||||
|
that affect the structure of the computation
|
||||||
|
graph from input ids/embeddings to the final hidden states,
|
||||||
|
excluding anything before input ids/embeddings and after
|
||||||
|
the final hidden states.
|
||||||
|
"""
|
||||||
|
factors: list[Any] = []
|
||||||
|
factors.append(self.pipeline_parallel_size)
|
||||||
|
factors.append(self.tensor_parallel_size)
|
||||||
|
factors.append(self.enable_expert_parallel)
|
||||||
|
factors.append(self.data_parallel_size)
|
||||||
|
factors.append(envs.VLLM_ALL2ALL_BACKEND)
|
||||||
|
return hashlib.sha256(str(factors).encode()).hexdigest()
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
|
||||||
|
self.world_size = self.pipeline_parallel_size * \
|
||||||
|
self.tensor_parallel_size
|
||||||
|
|
||||||
|
if self.data_parallel_size_local > self.data_parallel_size:
|
||||||
|
raise ValueError(
|
||||||
|
f"data_parallel_size_local ({self.data_parallel_size_local}) "
|
||||||
|
f"must be <= data_parallel_size ({self.data_parallel_size})")
|
||||||
|
|
||||||
|
if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
|
||||||
|
# Data parallel was specified in the engine args.
|
||||||
|
self.data_parallel_master_port = get_open_port()
|
||||||
|
|
||||||
|
if not (0 <= self.data_parallel_rank < self.data_parallel_size):
|
||||||
|
raise ValueError(
|
||||||
|
f"data_parallel_rank ({self.data_parallel_rank})"
|
||||||
|
f" must be in the range [0, {self.data_parallel_size})")
|
||||||
|
else:
|
||||||
|
# Otherwise fall back to env vars (e.g. for offline SPMD case).
|
||||||
|
self.data_parallel_size = envs.VLLM_DP_SIZE
|
||||||
|
self.data_parallel_rank = envs.VLLM_DP_RANK
|
||||||
|
self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
|
||||||
|
self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
|
||||||
|
self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT
|
||||||
|
|
||||||
|
if self.data_parallel_external_lb:
|
||||||
|
raise ValueError("data_parallel_external_lb can only "
|
||||||
|
"be set when data_parallel_size > 1")
|
||||||
|
|
||||||
|
if self.distributed_executor_backend == "external_launcher":
|
||||||
|
import os
|
||||||
|
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
|
||||||
|
logger.info("Disabling V1 multiprocessing for external launcher.")
|
||||||
|
|
||||||
|
if self.enable_eplb:
|
||||||
|
if not current_platform.is_cuda():
|
||||||
|
raise ValueError(
|
||||||
|
"Expert parallelism load balancing is only supported on "
|
||||||
|
"CUDA devices now.")
|
||||||
|
if self.num_redundant_experts < 0:
|
||||||
|
raise ValueError(
|
||||||
|
"num_redundant_experts must be non-negative, but got "
|
||||||
|
f"{self.num_redundant_experts}.")
|
||||||
|
if not self.enable_expert_parallel:
|
||||||
|
raise ValueError(
|
||||||
|
"enable_expert_parallel must be True to use EPLB.")
|
||||||
|
if self.tensor_parallel_size * self.data_parallel_size <= 1:
|
||||||
|
raise ValueError(
|
||||||
|
"EPLB requires tensor_parallel_size or data_parallel_size "
|
||||||
|
f"to be greater than 1, but got "
|
||||||
|
f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if self.num_redundant_experts != 0:
|
||||||
|
raise ValueError(
|
||||||
|
"num_redundant_experts should be used with EPLB."
|
||||||
|
f"{self.num_redundant_experts}.")
|
||||||
|
if self.distributed_executor_backend is None and self.world_size > 1:
|
||||||
|
# We use multiprocessing by default if world_size fits on the
|
||||||
|
# current node and we aren't in a ray placement group.
|
||||||
|
|
||||||
|
from vllm.executor import ray_utils
|
||||||
|
backend: DistributedExecutorBackend = "mp"
|
||||||
|
ray_found = ray_utils.ray_is_available()
|
||||||
|
if current_platform.is_neuron():
|
||||||
|
# neuron uses single process to control multiple devices
|
||||||
|
backend = "uni"
|
||||||
|
elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
|
||||||
|
backend = "uni"
|
||||||
|
elif (current_platform.is_cuda()
|
||||||
|
and cuda_device_count_stateless() < self.world_size):
|
||||||
|
if not ray_found:
|
||||||
|
raise ValueError("Unable to load Ray: "
|
||||||
|
f"{ray_utils.ray_import_err}. Ray is "
|
||||||
|
"required for multi-node inference, "
|
||||||
|
"please install Ray with `pip install "
|
||||||
|
"ray`.")
|
||||||
|
backend = "ray"
|
||||||
|
elif self.data_parallel_backend == "ray":
|
||||||
|
logger.info("Using ray distributed inference because "
|
||||||
|
"data_parallel_backend is ray")
|
||||||
|
backend = "ray"
|
||||||
|
elif ray_found:
|
||||||
|
if self.placement_group:
|
||||||
|
backend = "ray"
|
||||||
|
else:
|
||||||
|
from ray import is_initialized as ray_is_initialized
|
||||||
|
if ray_is_initialized():
|
||||||
|
from ray.util import get_current_placement_group
|
||||||
|
if get_current_placement_group():
|
||||||
|
backend = "ray"
|
||||||
|
self.distributed_executor_backend = backend
|
||||||
|
logger.debug("Defaulting to use %s for distributed inference",
|
||||||
|
backend)
|
||||||
|
|
||||||
|
if self.distributed_executor_backend is None and self.world_size == 1:
|
||||||
|
self.distributed_executor_backend = "uni"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def use_ray(self) -> bool:
|
||||||
|
return self.distributed_executor_backend == "ray" or (
|
||||||
|
isinstance(self.distributed_executor_backend, type)
|
||||||
|
and self.distributed_executor_backend.uses_ray)
|
||||||
|
|
||||||
|
@model_validator(mode='after')
|
||||||
|
def _verify_args(self) -> Self:
|
||||||
|
# Lazy import to avoid circular import
|
||||||
|
from vllm.executor.executor_base import ExecutorBase
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
if self.distributed_executor_backend not in (
|
||||||
|
"ray", "mp", "uni",
|
||||||
|
"external_launcher", None) and not (isinstance(
|
||||||
|
self.distributed_executor_backend, type) and issubclass(
|
||||||
|
self.distributed_executor_backend, ExecutorBase)):
|
||||||
|
raise ValueError(
|
||||||
|
"Unrecognized distributed executor backend "
|
||||||
|
f"{self.distributed_executor_backend}. Supported "
|
||||||
|
"values are 'ray', 'mp' 'uni', 'external_launcher' or"
|
||||||
|
" custom ExecutorBase subclass.")
|
||||||
|
if self.use_ray:
|
||||||
|
from vllm.executor import ray_utils
|
||||||
|
ray_utils.assert_ray_available()
|
||||||
|
|
||||||
|
if not current_platform.use_custom_allreduce():
|
||||||
|
self.disable_custom_all_reduce = True
|
||||||
|
logger.debug(
|
||||||
|
"Disabled the custom all-reduce kernel because it is not "
|
||||||
|
"supported on current platform.")
|
||||||
|
if self.ray_workers_use_nsight and not self.use_ray:
|
||||||
|
raise ValueError("Unable to use nsight profiling unless workers "
|
||||||
|
"run with Ray.")
|
||||||
|
|
||||||
|
return self
|
||||||
Loading…
x
Reference in New Issue
Block a user