first round of fixes

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-06-03 02:38:44 +00:00
parent 18e7d6c7b8
commit 539c0c3add
3 changed files with 29 additions and 5 deletions

View File

@ -1752,6 +1752,18 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
enable_microbatching: bool = False
"""Enable microbatching for the model executor."""
always_microbatch_if_enabled: bool = True
"""Always microbatch if microbatching is enabled. Easier to sync between
dp workers."""
microbatching_token_threshold: int = 4
"""The threshold for microbatching. If the number of tokens in the
request is greater than this threshold, microbatching will be used.
Otherwise, the request will be processed in a single batch."""
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
"""This parameter is deprecated and will be removed in a future release.
Please remove it from your configs"""
@ -4434,6 +4446,15 @@ class VllmConfig:
self.model_config.disable_cascade_attn = True
self.cache_config.enable_prefix_caching = False
if self.parallel_config.enable_microbatching and \
self.compilation_config.level >= CompilationLevel.PIECEWISE:
# Microbatching is not supported with piecewise compilation yet.
# More specifically piecewise cuda-graphs
logger.warning_once(
"Piecewise compilation is not supported with "
"microbatching. Disabling piecewise compilation.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if (self.kv_events_config is not None
and self.kv_events_config.enable_kv_cache_events
and not self.cache_config.enable_prefix_caching):

View File

@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext:
def create_forward_context(attn_metadata: Any,
vllm_config: VllmConfig,
virtual_engine: int = 0,
num_tokens: int = 0):
num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None):
dp_metadata: Optional[DPMetadata] = None
if vllm_config.parallel_config.data_parallel_size > 1 and (
attn_metadata is not None or num_tokens is not None):
@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]):
def set_forward_context(attn_metadata: Any,
vllm_config: VllmConfig,
virtual_engine: int = 0,
num_tokens: int = 0):
num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None):
"""A context manager that stores the current forward context,
can be attention metadata, etc.
Here we can inject common logic for every model forward pass.
@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any,
forward_start_time = time.perf_counter()
forward_context = create_forward_context(attn_metadata, vllm_config,
virtual_engine, num_tokens)
virtual_engine, num_tokens,
num_tokens_across_dp)
try:
with override_forward_context(forward_context):

View File

@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
))
for layer_name in kv_cache_group_spec.layer_names:
assert type(attn_metadata) is list
assert attn_metadata_i is not None
# assert attn_metadata_i is not None
# What if it's None? Do we still add it to the list?
attn_metadata[ubid][layer_name] = attn_metadata_i
else:
@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
if use_dummy_input:
assert num_dummy_tokens == 1
# assert num_dummy_tokens == 1
return self._get_dummy_model_inputs(num_dummy_tokens)
else:
assert scheduler_output is not None