diff --git a/vllm/config.py b/vllm/config.py index d0891d670b76d..40c87fa44f020 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1752,6 +1752,18 @@ class ParallelConfig: disable_custom_all_reduce: bool = False """Disable the custom all-reduce kernel and fall back to NCCL.""" + enable_microbatching: bool = False + """Enable microbatching for the model executor.""" + + always_microbatch_if_enabled: bool = True + """Always microbatch if microbatching is enabled. Easier to sync between + dp workers.""" + + microbatching_token_threshold: int = 4 + """The threshold for microbatching. If the number of tokens in the + request is greater than this threshold, microbatching will be used. + Otherwise, the request will be processed in a single batch.""" + tokenizer_pool_config: Optional[TokenizerPoolConfig] = None """This parameter is deprecated and will be removed in a future release. Please remove it from your configs""" @@ -4434,6 +4446,15 @@ class VllmConfig: self.model_config.disable_cascade_attn = True self.cache_config.enable_prefix_caching = False + if self.parallel_config.enable_microbatching and \ + self.compilation_config.level >= CompilationLevel.PIECEWISE: + # Microbatching is not supported with piecewise compilation yet. + # More specifically piecewise cuda-graphs + logger.warning_once( + "Piecewise compilation is not supported with " + "microbatching. 
Disabling piecewise compilation.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 6ed6015ab2f7f..7281d1906b32a 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext: def create_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, - num_tokens: int = 0): + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None): dp_metadata: Optional[DPMetadata] = None if vllm_config.parallel_config.data_parallel_size > 1 and ( attn_metadata is not None or num_tokens is not None): @@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]): def set_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, - num_tokens: int = 0): + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass.
@@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any, forward_start_time = time.perf_counter() forward_context = create_forward_context(attn_metadata, vllm_config, - virtual_engine, num_tokens) + virtual_engine, num_tokens, + num_tokens_across_dp) try: with override_forward_context(forward_context): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4e4e34a16bce3..f01532d37805c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): )) for layer_name in kv_cache_group_spec.layer_names: assert type(attn_metadata) is list - assert attn_metadata_i is not None + # assert attn_metadata_i is not None # What if it's None? Do we still add it to the list? attn_metadata[ubid][layer_name] = attn_metadata_i else: @@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple: if use_dummy_input: - assert num_dummy_tokens == 1 + # assert num_dummy_tokens == 1 return self._get_dummy_model_inputs(num_dummy_tokens) else: assert scheduler_output is not None