[Misc] reuse num_tokens_across_dp of get_dp_padding to avoid unnecessary dp all reduce in set_forward_context (#18935)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Parent: 432ec9926e
Commit: d6fd3a33b8
@@ -47,8 +47,12 @@ class DPMetadata:
         return num_tokens_tensor
 
     @staticmethod
-    def make(parallel_config: ParallelConfig, attn_metadata: Any,
-             num_tokens: int) -> "DPMetadata":
+    def make(
+        parallel_config: ParallelConfig,
+        attn_metadata: Any,
+        num_tokens: int,
+        num_tokens_across_dp: Optional[torch.Tensor] = None
+    ) -> "DPMetadata":
 
         assert parallel_config.data_parallel_size > 1
         dp_size = parallel_config.data_parallel_size
@@ -62,10 +66,15 @@ class DPMetadata:
         # for v1 attention backends or no attn_metadata
         batchsize = num_tokens
 
-        num_tokens_tensor = DPMetadata.num_tokens_across_dp(
-            batchsize, dp_size, dp_rank)
-        max_tokens_across_dp_cpu = torch.max(num_tokens_tensor)
-        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0)
+        # If num_tokens_across_dp is None, it will be computed by all_reduce
+        # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
+        assert (num_tokens_across_dp is None
+                or num_tokens_across_dp[dp_rank] == batchsize)
+        if num_tokens_across_dp is None:
+            num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+                batchsize, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
+        cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)
         return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu)
 
 
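For context, a minimal standalone sketch of the new DPMetadata.make contract (not vLLM's actual class; the collective is replaced by a hypothetical local fallback so the snippet runs without a distributed setup). The point of the change: the all_reduce path is only taken when the caller does not supply num_tokens_across_dp, and a supplied tensor must agree with this rank's batch size.

from typing import Optional

import torch


def make_metadata(batchsize: int, dp_rank: int,
                  num_tokens_across_dp: Optional[torch.Tensor] = None):
    # A supplied tensor must report this rank's own token count correctly.
    assert (num_tokens_across_dp is None
            or num_tokens_across_dp[dp_rank] == batchsize)
    if num_tokens_across_dp is None:
        # Stand-in for DPMetadata.num_tokens_across_dp, which issues a DP
        # all_reduce in the real code; here we fabricate a single-rank view.
        num_tokens_across_dp = torch.tensor([batchsize], dtype=torch.int32)
    max_tokens = torch.max(num_tokens_across_dp)
    cu_tokens = torch.cumsum(num_tokens_across_dp, dim=0)
    return max_tokens, cu_tokens


# Precomputed path: no collective is needed.
print(make_metadata(96, dp_rank=1,
                    num_tokens_across_dp=torch.tensor([128, 96, 64, 112])))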
@@ -101,7 +110,8 @@ def get_forward_context() -> ForwardContext:
 def set_forward_context(attn_metadata: Any,
                         vllm_config: VllmConfig,
                         virtual_engine: int = 0,
-                        num_tokens: Optional[int] = None):
+                        num_tokens: Optional[int] = None,
+                        num_tokens_across_dp: Optional[torch.Tensor] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
@@ -114,7 +124,8 @@ def set_forward_context(attn_metadata: Any,
     if vllm_config.parallel_config.data_parallel_size > 1 and (
             attn_metadata is not None or num_tokens is not None):
         dp_metadata = DPMetadata.make(vllm_config.parallel_config,
-                                      attn_metadata, num_tokens or 0)
+                                      attn_metadata, num_tokens or 0,
+                                      num_tokens_across_dp)
 
     global _forward_context
     prev_context = _forward_context
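To illustrate how the new keyword argument is threaded through, here is a hedged sketch using a stand-in context manager (not vLLM's real set_forward_context): it simply forwards an optional precomputed tensor to the metadata construction and only falls back to a placeholder where the real code would all_reduce.

from contextlib import contextmanager
from typing import Optional

import torch


@contextmanager
def forward_context(num_tokens: int,
                    dp_size: int,
                    num_tokens_across_dp: Optional[torch.Tensor] = None):
    # Without a precomputed tensor the real code would all_reduce here; this
    # placeholder just assumes every rank has the same number of tokens.
    if num_tokens_across_dp is None:
        num_tokens_across_dp = torch.full((dp_size, ),
                                          num_tokens,
                                          dtype=torch.int32)
    yield {
        "max_tokens": int(torch.max(num_tokens_across_dp)),
        "cu_tokens": torch.cumsum(num_tokens_across_dp, dim=0),
    }


# The model runner passes along the tensor it already computed while padding:
with forward_context(128, dp_size=4,
                     num_tokens_across_dp=torch.tensor([128] * 4)) as ctx:
    print(ctx["max_tokens"])  # 128, no extra collective required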
@@ -1111,17 +1111,30 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             for k, v in self.intermediate_tensors.items()
         })
 
-    def get_dp_padding(self, num_tokens: int):
+    def get_dp_padding(self,
+                       num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-        if dp_size == 1:
+
+        # For DP: Don't pad when setting enforce_eager.
+        # This lets us set enforce_eager on the prefiller in a P/D setup and
+        # still use CUDA graphs (enabled by this padding) on the decoder.
+        #
+        # TODO(tms) : There are many cases where padding is enabled for
+        # prefills, causing unnecessary and excessive padding of activations.
+
+        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
             # Early exit.
-            return 0
+            return 0, None
 
         num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
             num_tokens, dp_size, dp_rank)
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        return max_tokens_across_dp_cpu - num_tokens
+        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+                                                dp_size,
+                                                device="cpu",
+                                                dtype=torch.int32)
+        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
 
     @torch.inference_mode()
     def execute_model(
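The padding arithmetic itself can be seen in isolation with a standalone sketch (the per-rank counts are hard-coded here, standing in for what DPMetadata.num_tokens_across_dp would gather): every rank is padded up to the DP-wide maximum, so the tensor handed back to the caller is uniform and can be reused by set_forward_context without another all_reduce.

import torch

dp_size, dp_rank = 4, 1
# Assumed per-rank counts, standing in for DPMetadata.num_tokens_across_dp.
num_tokens_across_dp = torch.tensor([128, 96, 64, 112], dtype=torch.int32)
num_tokens = int(num_tokens_across_dp[dp_rank])  # this rank's 96 tokens

max_tokens_across_dp_cpu = int(torch.max(num_tokens_across_dp).item())
num_pad = max_tokens_across_dp_cpu - num_tokens  # 32 tokens of padding here
# All ranks are padded to the same length, so the returned tensor is uniform.
num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] * dp_size,
                                        device="cpu",
                                        dtype=torch.int32)
print(num_pad, num_tokens_after_padding)
# 32 tensor([128, 128, 128, 128], dtype=torch.int32)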
@@ -1161,7 +1174,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             num_input_tokens = num_scheduled_tokens
 
         # Padding for DP
-        num_input_tokens += self.get_dp_padding(num_input_tokens)
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
+        num_input_tokens += num_pad
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
@@ -1208,7 +1222,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata,
                                  self.vllm_config,
-                                 num_tokens=num_input_tokens):
+                                 num_tokens=num_input_tokens,
+                                 num_tokens_across_dp=num_tokens_across_dp):
             self.maybe_setup_kv_connector(scheduler_output)
 
             model_output = self.model(
@@ -1681,7 +1696,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     ) -> torch.Tensor:
 
         # Padding for DP
-        num_tokens += self.get_dp_padding(num_tokens)
+        num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
+        num_tokens += num_pad
 
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
@@ -1747,9 +1763,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                 num_tokens, None, False)
 
-        with set_forward_context(attn_metadata,
-                                 self.vllm_config,
-                                 num_tokens=num_tokens):
+        with set_forward_context(
+                attn_metadata,
+                self.vllm_config,
+                num_tokens=num_tokens,
+                num_tokens_across_dp=num_tokens_across_dp):
             outputs = model(
                 input_ids=input_ids,
                 positions=positions,