first round of fixes

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-06-03 02:38:44 +00:00
parent 18e7d6c7b8
commit 539c0c3add
3 changed files with 29 additions and 5 deletions

View File

@ -1752,6 +1752,18 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
enable_microbatching: bool = False
"""Enable microbatching for the model executor."""
always_microbatch_if_enabled: bool = True
"""Always microbatch if microbatching is enabled. Easier to sync between
dp workers."""
microbatching_token_threshold: int = 4
"""The threshold for microbatching. If the number of tokens in the
request is greater than this threshold, microbatching will be used.
Otherwise, the request will be processed in a single batch."""
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
"""This parameter is deprecated and will be removed in a future release.
Please remove it from your configs"""
@ -4434,6 +4446,15 @@ class VllmConfig:
self.model_config.disable_cascade_attn = True
self.cache_config.enable_prefix_caching = False
if self.parallel_config.enable_microbatching and \
self.compilation_config.level >= CompilationLevel.PIECEWISE:
# Microbatching is not supported with piecewise compilation yet.
# More specifically piecewise cuda-graphs
logger.warning_once(
"Piecewise compilation is not supported with "
"microbatching. Disabling piecewise compilation.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if (self.kv_events_config is not None
and self.kv_events_config.enable_kv_cache_events
and not self.cache_config.enable_prefix_caching):

View File

@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext:
def create_forward_context(attn_metadata: Any,
vllm_config: VllmConfig,
virtual_engine: int = 0,
num_tokens: int = 0):
num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None):
dp_metadata: Optional[DPMetadata] = None
if vllm_config.parallel_config.data_parallel_size > 1 and (
attn_metadata is not None or num_tokens is not None):
@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]):
def set_forward_context(attn_metadata: Any,
vllm_config: VllmConfig,
virtual_engine: int = 0,
num_tokens: int = 0):
num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None):
"""A context manager that stores the current forward context,
can be attention metadata, etc.
Here we can inject common logic for every model forward pass.
@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any,
forward_start_time = time.perf_counter()
forward_context = create_forward_context(attn_metadata, vllm_config,
virtual_engine, num_tokens)
virtual_engine, num_tokens,
num_tokens_across_dp)
try:
with override_forward_context(forward_context):

View File

@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
))
for layer_name in kv_cache_group_spec.layer_names:
assert type(attn_metadata) is list
assert attn_metadata_i is not None
# assert attn_metadata_i is not None
# What if it's None? Do we still add it to the list?
attn_metadata[ubid][layer_name] = attn_metadata_i
else:
@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
if use_dummy_input:
assert num_dummy_tokens == 1
# assert num_dummy_tokens == 1
return self._get_dummy_model_inputs(num_dummy_tokens)
else:
assert scheduler_output is not None