mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-04 18:35:43 +08:00
more fixes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
539c0c3add
commit
5f4a501b9a
@ -45,6 +45,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
expert_map: Optional[torch.Tensor],
|
expert_map: Optional[torch.Tensor],
|
||||||
apply_router_weight_on_input: bool,
|
apply_router_weight_on_input: bool,
|
||||||
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
|
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||||
|
assert False
|
||||||
num_tokens = a1.size(0) # M
|
num_tokens = a1.size(0) # M
|
||||||
hidden_dim = a1.size(-1) # K
|
hidden_dim = a1.size(-1) # K
|
||||||
ubatch_ctx = get_current_ubatch_context()
|
ubatch_ctx = get_current_ubatch_context()
|
||||||
@ -144,6 +145,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
topk_ids: torch.Tensor,
|
topk_ids: torch.Tensor,
|
||||||
apply_router_weight_on_input: bool,
|
apply_router_weight_on_input: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
assert False
|
||||||
num_tokens = output.size(0) # M
|
num_tokens = output.size(0) # M
|
||||||
# This argument is optional
|
# This argument is optional
|
||||||
# There's not much point setting this unless it is != topk_ids.size(0)
|
# There's not much point setting this unless it is != topk_ids.size(0)
|
||||||
|
|||||||
@ -1200,6 +1200,31 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
for k, v in self.intermediate_tensors.items()
|
for k, v in self.intermediate_tensors.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def get_dp_padding(self,
|
||||||
|
num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
|
||||||
|
dp_size = self.vllm_config.parallel_config.data_parallel_size
|
||||||
|
dp_rank = self.vllm_config.parallel_config.data_parallel_rank
|
||||||
|
|
||||||
|
# For DP: Don't pad when setting enforce_eager.
|
||||||
|
# This lets us set enforce_eager on the prefiller in a P/D setup and
|
||||||
|
# still use CUDA graphs (enabled by this padding) on the decoder.
|
||||||
|
#
|
||||||
|
# TODO(tms) : There are many cases where padding is enabled for
|
||||||
|
# prefills, causing unnecessary and excessive padding of activations.
|
||||||
|
|
||||||
|
if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
|
||||||
|
# Early exit.
|
||||||
|
return 0, None
|
||||||
|
|
||||||
|
num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
|
||||||
|
num_tokens, dp_size, dp_rank)
|
||||||
|
max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
|
||||||
|
num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
|
||||||
|
dp_size,
|
||||||
|
device="cpu",
|
||||||
|
dtype=torch.int32)
|
||||||
|
return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
|
||||||
|
|
||||||
def _get_dummy_model_inputs(self, num_tokens: int) -> tuple:
|
def _get_dummy_model_inputs(self, num_tokens: int) -> tuple:
|
||||||
# Dummy batch. (hopefully we are the last one so we can just
|
# Dummy batch. (hopefully we are the last one so we can just
|
||||||
# update this to a one token batch and return)
|
# update this to a one token batch and return)
|
||||||
@ -1306,7 +1331,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
num_scheduled_tokens: Optional[int],
|
num_scheduled_tokens: Optional[int],
|
||||||
ubatch_slices: Optional[UBatchSlices] = None,
|
ubatch_slices: Optional[UBatchSlices] = None,
|
||||||
scheduler_output: Optional["SchedulerOutput"] = None,
|
scheduler_output: Optional["SchedulerOutput"] = None,
|
||||||
is_dummy_run: bool = False):
|
is_dummy_run: bool = False,
|
||||||
|
num_tokens_across_dp: Optional[torch.Tensor] = None):
|
||||||
|
|
||||||
num_dummy_tokens = num_scheduled_tokens if is_dummy_run else 1
|
num_dummy_tokens = num_scheduled_tokens if is_dummy_run else 1
|
||||||
|
|
||||||
@ -1367,7 +1393,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
attn_metadata[i]
|
attn_metadata[i]
|
||||||
if attn_metadata is not None else None,
|
if attn_metadata is not None else None,
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
num_tokens=num_tokens)
|
num_tokens=num_tokens,
|
||||||
|
num_tokens_across_dp=num_tokens_across_dp)
|
||||||
|
|
||||||
thread = threading.Thread(target=_ubatch_thread,
|
thread = threading.Thread(target=_ubatch_thread,
|
||||||
args=(
|
args=(
|
||||||
@ -1400,36 +1427,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
slice(0, num_scheduled_tokens),
|
slice(0, num_scheduled_tokens),
|
||||||
set_forward_context(attn_metadata,
|
set_forward_context(attn_metadata,
|
||||||
vllm_config=self.vllm_config,
|
vllm_config=self.vllm_config,
|
||||||
num_tokens=num_scheduled_tokens or 1),
|
num_tokens=num_scheduled_tokens or 1,
|
||||||
|
num_tokens_across_dp=num_tokens_across_dp),
|
||||||
is_dummy_run)
|
is_dummy_run)
|
||||||
|
|
||||||
return model_output
|
return model_output
|
||||||
|
|
||||||
def get_dp_padding(self,
|
|
||||||
num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
|
|
||||||
dp_size = self.vllm_config.parallel_config.data_parallel_size
|
|
||||||
dp_rank = self.vllm_config.parallel_config.data_parallel_rank
|
|
||||||
|
|
||||||
# For DP: Don't pad when setting enforce_eager.
|
|
||||||
# This lets us set enforce_eager on the prefiller in a P/D setup and
|
|
||||||
# still use CUDA graphs (enabled by this padding) on the decoder.
|
|
||||||
#
|
|
||||||
# TODO(tms) : There are many cases where padding is enabled for
|
|
||||||
# prefills, causing unnecessary and excessive padding of activations.
|
|
||||||
|
|
||||||
if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
|
|
||||||
# Early exit.
|
|
||||||
return 0, None
|
|
||||||
|
|
||||||
num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
|
|
||||||
num_tokens, dp_size, dp_rank)
|
|
||||||
max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
|
|
||||||
num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
|
|
||||||
dp_size,
|
|
||||||
device="cpu",
|
|
||||||
dtype=torch.int32)
|
|
||||||
return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
|
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def execute_model(
|
def execute_model(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user