first round of fixes

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-06-03 02:38:44 +00:00
parent 18e7d6c7b8
commit 539c0c3add
3 changed files with 29 additions and 5 deletions

View File

@ -1752,6 +1752,18 @@ class ParallelConfig:
disable_custom_all_reduce: bool = False disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL.""" """Disable the custom all-reduce kernel and fall back to NCCL."""
enable_microbatching: bool = False
"""Enable microbatching for the model executor."""
always_microbatch_if_enabled: bool = True
"""Always microbatch if microbatching is enabled. This makes it easier to
keep data-parallel (DP) workers in sync."""
microbatching_token_threshold: int = 4
"""The threshold for microbatching. If the number of tokens in the
request is greater than this threshold, microbatching will be used.
Otherwise, the request will be processed in a single batch."""
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
"""This parameter is deprecated and will be removed in a future release. """This parameter is deprecated and will be removed in a future release.
Please remove it from your configs""" Please remove it from your configs"""
@ -4434,6 +4446,15 @@ class VllmConfig:
self.model_config.disable_cascade_attn = True self.model_config.disable_cascade_attn = True
self.cache_config.enable_prefix_caching = False self.cache_config.enable_prefix_caching = False
if self.parallel_config.enable_microbatching and \
self.compilation_config.level >= CompilationLevel.PIECEWISE:
# Microbatching is not supported with piecewise compilation yet;
# more specifically, it is not supported with piecewise CUDA graphs.
logger.warning_once(
"Piecewise compilation is not supported with "
"microbatching. Disabling piecewise compilation.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if (self.kv_events_config is not None if (self.kv_events_config is not None
and self.kv_events_config.enable_kv_cache_events and self.kv_events_config.enable_kv_cache_events
and not self.cache_config.enable_prefix_caching): and not self.cache_config.enable_prefix_caching):

View File

@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext:
def create_forward_context(attn_metadata: Any, def create_forward_context(attn_metadata: Any,
vllm_config: VllmConfig, vllm_config: VllmConfig,
virtual_engine: int = 0, virtual_engine: int = 0,
num_tokens: int = 0): num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None):
dp_metadata: Optional[DPMetadata] = None dp_metadata: Optional[DPMetadata] = None
if vllm_config.parallel_config.data_parallel_size > 1 and ( if vllm_config.parallel_config.data_parallel_size > 1 and (
attn_metadata is not None or num_tokens is not None): attn_metadata is not None or num_tokens is not None):
@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]):
def set_forward_context(attn_metadata: Any, def set_forward_context(attn_metadata: Any,
vllm_config: VllmConfig, vllm_config: VllmConfig,
virtual_engine: int = 0, virtual_engine: int = 0,
num_tokens: int = 0): num_tokens: Optional[int] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None):
"""A context manager that stores the current forward context, """A context manager that stores the current forward context,
can be attention metadata, etc. can be attention metadata, etc.
Here we can inject common logic for every model forward pass. Here we can inject common logic for every model forward pass.
@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any,
forward_start_time = time.perf_counter() forward_start_time = time.perf_counter()
forward_context = create_forward_context(attn_metadata, vllm_config, forward_context = create_forward_context(attn_metadata, vllm_config,
virtual_engine, num_tokens) virtual_engine, num_tokens,
num_tokens_across_dp)
try: try:
with override_forward_context(forward_context): with override_forward_context(forward_context):

View File

@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
)) ))
for layer_name in kv_cache_group_spec.layer_names: for layer_name in kv_cache_group_spec.layer_names:
assert type(attn_metadata) is list assert type(attn_metadata) is list
assert attn_metadata_i is not None # assert attn_metadata_i is not None
# What if it's None? Do we still add it to the list? # What if it's None? Do we still add it to the list?
attn_metadata[ubid][layer_name] = attn_metadata_i attn_metadata[ubid][layer_name] = attn_metadata_i
else: else:
@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple: def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
if use_dummy_input: if use_dummy_input:
assert num_dummy_tokens == 1 # assert num_dummy_tokens == 1
return self._get_dummy_model_inputs(num_dummy_tokens) return self._get_dummy_model_inputs(num_dummy_tokens)
else: else:
assert scheduler_output is not None assert scheduler_output is not None