diff --git a/vllm/config.py b/vllm/config.py index d0891d670b76d..40c87fa44f020 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1752,6 +1752,18 @@ class ParallelConfig: disable_custom_all_reduce: bool = False """Disable the custom all-reduce kernel and fall back to NCCL.""" + enable_microbatching: bool = False + """Enable microbatching for the model executor.""" + + always_microbatch_if_enabled: bool = True + """Always microbatch if microbatching is enabled. Easier to sync between + dp workers.""" + + microbatching_token_threshold: int = 4 + """The threshold for microbatching. If the number of tokens in the + request is greater than this threshold, microbatching will be used. + Otherwise, the request will be processed in a single batch.""" + tokenizer_pool_config: Optional[TokenizerPoolConfig] = None """This parameter is deprecated and will be removed in a future release. Please remove it from your configs""" @@ -4434,6 +4446,15 @@ class VllmConfig: self.model_config.disable_cascade_attn = True self.cache_config.enable_prefix_caching = False + if self.parallel_config.enable_microbatching and \ + self.compilation_config.level >= CompilationLevel.PIECEWISE: + # Microbatching is not supported with piecewise compilation yet. + # More specifically piecewise cuda-graphs + logger.warning_once( + "Piecewise compilation is not supported with " + "microbatching. 
Disabling piecewise compilation.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 6ed6015ab2f7f..7281d1906b32a 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext: def create_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, - num_tokens: int = 0): + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None): dp_metadata: Optional[DPMetadata] = None if vllm_config.parallel_config.data_parallel_size > 1 and ( attn_metadata is not None or num_tokens is not None): @@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]): def set_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, - num_tokens: int = 0): + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass.
@@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any, forward_start_time = time.perf_counter() forward_context = create_forward_context(attn_metadata, vllm_config, - virtual_engine, num_tokens) + virtual_engine, num_tokens, + num_tokens_across_dp) try: with override_forward_context(forward_context): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4e4e34a16bce3..f01532d37805c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): )) for layer_name in kv_cache_group_spec.layer_names: assert type(attn_metadata) is list - assert attn_metadata_i is not None + # assert attn_metadata_i is not None # What if it's None? Do we still add it to the list? attn_metadata[ubid][layer_name] = attn_metadata_i else: @@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple: if use_dummy_input: - assert num_dummy_tokens == 1 + # assert num_dummy_tokens == 1 return self._get_dummy_model_inputs(num_dummy_tokens) else: assert scheduler_output is not None