mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 09:51:19 +08:00
first round of fixes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
18e7d6c7b8
commit
539c0c3add
@ -1752,6 +1752,18 @@ class ParallelConfig:
|
|||||||
disable_custom_all_reduce: bool = False
|
disable_custom_all_reduce: bool = False
|
||||||
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
"""Disable the custom all-reduce kernel and fall back to NCCL."""
|
||||||
|
|
||||||
|
enable_microbatching: bool = False
|
||||||
|
"""Enable microbatching for the model executor."""
|
||||||
|
|
||||||
|
always_microbatch_if_enabled: bool = True
|
||||||
|
"""Always microbatch if microbatching is enabled. Easier to sync between
|
||||||
|
dp workers."""
|
||||||
|
|
||||||
|
microbatching_token_threshold: int = 4
|
||||||
|
"""The threshold for microbatching. If the number of tokens in the
|
||||||
|
request is greater than this threshold, microbatching will be used.
|
||||||
|
Otherwise, the request will be processed in a single batch."""
|
||||||
|
|
||||||
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
|
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
|
||||||
"""This parameter is deprecated and will be removed in a future release.
|
"""This parameter is deprecated and will be removed in a future release.
|
||||||
Please remove it from your configs"""
|
Please remove it from your configs"""
|
||||||
@ -4434,6 +4446,15 @@ class VllmConfig:
|
|||||||
self.model_config.disable_cascade_attn = True
|
self.model_config.disable_cascade_attn = True
|
||||||
self.cache_config.enable_prefix_caching = False
|
self.cache_config.enable_prefix_caching = False
|
||||||
|
|
||||||
|
if self.parallel_config.enable_microbatching and \
|
||||||
|
self.compilation_config.level >= CompilationLevel.PIECEWISE:
|
||||||
|
# Microbatching is not supported with piecewise compilation yet.
|
||||||
|
# More specifically, piecewise CUDA graphs are not supported.
|
||||||
|
logger.warning_once(
|
||||||
|
"Piecewise compilation is not supported with "
|
||||||
|
"microbatching. Disabling piecewise compilation.")
|
||||||
|
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||||
|
|
||||||
if (self.kv_events_config is not None
|
if (self.kv_events_config is not None
|
||||||
and self.kv_events_config.enable_kv_cache_events
|
and self.kv_events_config.enable_kv_cache_events
|
||||||
and not self.cache_config.enable_prefix_caching):
|
and not self.cache_config.enable_prefix_caching):
|
||||||
|
|||||||
@ -109,7 +109,8 @@ def get_forward_context() -> ForwardContext:
|
|||||||
def create_forward_context(attn_metadata: Any,
|
def create_forward_context(attn_metadata: Any,
|
||||||
vllm_config: VllmConfig,
|
vllm_config: VllmConfig,
|
||||||
virtual_engine: int = 0,
|
virtual_engine: int = 0,
|
||||||
num_tokens: int = 0):
|
num_tokens: Optional[int] = None,
|
||||||
|
num_tokens_across_dp: Optional[torch.Tensor] = None):
|
||||||
dp_metadata: Optional[DPMetadata] = None
|
dp_metadata: Optional[DPMetadata] = None
|
||||||
if vllm_config.parallel_config.data_parallel_size > 1 and (
|
if vllm_config.parallel_config.data_parallel_size > 1 and (
|
||||||
attn_metadata is not None or num_tokens is not None):
|
attn_metadata is not None or num_tokens is not None):
|
||||||
@ -143,7 +144,8 @@ def override_forward_context(forward_context: Optional[ForwardContext]):
|
|||||||
def set_forward_context(attn_metadata: Any,
|
def set_forward_context(attn_metadata: Any,
|
||||||
vllm_config: VllmConfig,
|
vllm_config: VllmConfig,
|
||||||
virtual_engine: int = 0,
|
virtual_engine: int = 0,
|
||||||
num_tokens: int = 0):
|
num_tokens: Optional[int] = None,
|
||||||
|
num_tokens_across_dp: Optional[torch.Tensor] = None):
|
||||||
"""A context manager that stores the current forward context,
|
"""A context manager that stores the current forward context,
|
||||||
can be attention metadata, etc.
|
can be attention metadata, etc.
|
||||||
Here we can inject common logic for every model forward pass.
|
Here we can inject common logic for every model forward pass.
|
||||||
@ -154,7 +156,8 @@ def set_forward_context(attn_metadata: Any,
|
|||||||
forward_start_time = time.perf_counter()
|
forward_start_time = time.perf_counter()
|
||||||
|
|
||||||
forward_context = create_forward_context(attn_metadata, vllm_config,
|
forward_context = create_forward_context(attn_metadata, vllm_config,
|
||||||
virtual_engine, num_tokens)
|
virtual_engine, num_tokens,
|
||||||
|
num_tokens_across_dp)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with override_forward_context(forward_context):
|
with override_forward_context(forward_context):
|
||||||
|
|||||||
@ -732,7 +732,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
))
|
))
|
||||||
for layer_name in kv_cache_group_spec.layer_names:
|
for layer_name in kv_cache_group_spec.layer_names:
|
||||||
assert type(attn_metadata) is list
|
assert type(attn_metadata) is list
|
||||||
assert attn_metadata_i is not None
|
# assert attn_metadata_i is not None
|
||||||
# What if it's None? Do we still add it to the list?
|
# What if it's None? Do we still add it to the list?
|
||||||
attn_metadata[ubid][layer_name] = attn_metadata_i
|
attn_metadata[ubid][layer_name] = attn_metadata_i
|
||||||
else:
|
else:
|
||||||
@ -1312,7 +1312,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
|
|
||||||
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
|
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
|
||||||
if use_dummy_input:
|
if use_dummy_input:
|
||||||
assert num_dummy_tokens == 1
|
# assert num_dummy_tokens == 1
|
||||||
return self._get_dummy_model_inputs(num_dummy_tokens)
|
return self._get_dummy_model_inputs(num_dummy_tokens)
|
||||||
else:
|
else:
|
||||||
assert scheduler_output is not None
|
assert scheduler_output is not None
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user