mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 20:17:15 +08:00
first round of fixes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
18e7d6c7b8
commit
539c0c3add
@ -1752,6 +1752,18 @@ class ParallelConfig:
|
||||
disable_custom_all_reduce: bool = False
|
||||
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
||||
|
||||
enable_microbatching: bool = False
|
||||
"""Enable microbatching for the model executor."""
|
||||
|
||||
always_microbatch_if_enabled: bool = True
|
||||
"""Always microbatch if microbatching is enabled. Easier to sync between
|
||||
dp workers."""
|
||||
|
||||
microbatching_token_threshold: int = 4
|
||||
"""The threshold for microbatching. If the number of tokens in the
|
||||
request is greater than this threshold, microbatching will be used.
|
||||
Otherwise, the request will be processed in a single batch."""
|
||||
|
||||
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
|
||||
"""This parameter is deprecated and will be removed in a future release.
|
||||
Please remove it from your configs."""
|
||||
@ -4434,6 +4446,15 @@ class VllmConfig:
|
||||
self.model_config.disable_cascade_attn = True
|
||||
self.cache_config.enable_prefix_caching = False
|
||||
|
||||
if self.parallel_config.enable_microbatching and \
|
||||
self.compilation_config.level >= CompilationLevel.PIECEWISE:
|
||||
# Microbatching is not supported with piecewise compilation yet.
|
||||
# More specifically piecewise cuda-graphs
|
||||
logger.warning_once(
|
||||
"Piecewise compilation is not supported with "
|
||||
"microbatching. Disabling piecewise compilation.")
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
|
||||
if (self.kv_events_config is not None
|
||||
and self.kv_events_config.enable_kv_cache_events
|
||||
and not self.cache_config.enable_prefix_caching):
|
||||
|
||||
@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext:
|
||||
def create_forward_context(attn_metadata: Any,
|
||||
vllm_config: VllmConfig,
|
||||
virtual_engine: int = 0,
|
||||
num_tokens: int = 0):
|
||||
num_tokens: Optional[int] = None,
|
||||
num_tokens_across_dp: Optional[torch.Tensor] = None):
|
||||
dp_metadata: Optional[DPMetadata] = None
|
||||
if vllm_config.parallel_config.data_parallel_size > 1 and (
|
||||
attn_metadata is not None or num_tokens is not None):
|
||||
@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]):
|
||||
def set_forward_context(attn_metadata: Any,
|
||||
vllm_config: VllmConfig,
|
||||
virtual_engine: int = 0,
|
||||
num_tokens: int = 0):
|
||||
num_tokens: Optional[int] = None,
|
||||
num_tokens_across_dp: Optional[torch.Tensor] = None):
|
||||
"""A context manager that stores the current forward context,
|
||||
can be attention metadata, etc.
|
||||
Here we can inject common logic for every model forward pass.
|
||||
@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any,
|
||||
forward_start_time = time.perf_counter()
|
||||
|
||||
forward_context = create_forward_context(attn_metadata, vllm_config,
|
||||
virtual_engine, num_tokens)
|
||||
virtual_engine, num_tokens,
|
||||
num_tokens_across_dp)
|
||||
|
||||
try:
|
||||
with override_forward_context(forward_context):
|
||||
|
||||
@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
))
|
||||
for layer_name in kv_cache_group_spec.layer_names:
|
||||
assert type(attn_metadata) is list
|
||||
assert attn_metadata_i is not None
|
||||
# assert attn_metadata_i is not None
|
||||
# What if it's None? Do we still add it to the list?
|
||||
attn_metadata[ubid][layer_name] = attn_metadata_i
|
||||
else:
|
||||
@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
|
||||
if use_dummy_input:
|
||||
assert num_dummy_tokens == 1
|
||||
# assert num_dummy_tokens == 1
|
||||
return self._get_dummy_model_inputs(num_dummy_tokens)
|
||||
else:
|
||||
assert scheduler_output is not None
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user