diff --git a/examples/basic-ub.py b/examples/basic-ub.py index 397f586b6598b..3f6fd2fdb82ea 100644 --- a/examples/basic-ub.py +++ b/examples/basic-ub.py @@ -40,11 +40,11 @@ def main(): max_model_len=1024, #load_format="dummy", ############### - tensor_parallel_size=1, - #data_parallel_size=2, - enable_expert_parallel=False, + #tensor_parallel_size=1, + data_parallel_size=2, + enable_expert_parallel=True, ############### - enable_microbatching=True, + #enable_microbatching=True, ) # Generate texts from the prompts. # The output is a list of RequestOutput objects diff --git a/vllm/config.py b/vllm/config.py index 42662ef9e6671..08de3fe32710e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4332,7 +4332,7 @@ class VllmConfig: logger.warning_once( "Piecewise compilation is not supported with " "microbatching. Disabling piecewiseching compilation.") - self.compilation_config.level = CompilationLevel.DYNAMO_ONCE + self.compilation_config.level = CompilationLevel.NO_COMPILATION if self.model_config and self.model_config.use_mla and \ diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 621eb8bdd9c77..b2774382ef6f0 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -7,6 +7,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) +from vllm.v1.worker.ubatching import get_current_ubatch_context, yield_impl # Note use: layer.get_all_to_all() to get an AllToAll instance @@ -117,7 +118,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_send=send, do_recv=not send, ) + + # if ubatch_ctx is not None: + # ubatch_ctx.gpu_stream_wait() dispatch(True) # Send + yield_impl(gpu_wait=False) dispatch(False) # Recv return expert_x, expert_x_scale, expert_num_tokens @@ -155,5 
+160,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_send=send, do_recv=not send, ) + # if ubatch_ctx is not None: + # ubatch_ctx.gpu_stream_wait() combine(True) + yield_impl(gpu_wait=False) combine(False) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index b78c193c1345a..1cb22a109dacc 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -49,6 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.v1.worker.ubatching import get_current_ubatch_context from .interfaces import SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, @@ -656,6 +657,9 @@ class DeepseekV2Model(nn.Module): intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: + if (ubatch_ctx := get_current_ubatch_context()) is not None: + print("in forward, ubatch:", ubatch_ctx.id) + if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds