mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-05 07:29:07 +08:00
config format
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
92e0cc79a8
commit
44a595f6d6
@ -1740,14 +1740,14 @@ class ParallelConfig:
|
|||||||
|
|
||||||
rank: int = 0
|
rank: int = 0
|
||||||
"""Global rank in distributed setup."""
|
"""Global rank in distributed setup."""
|
||||||
|
|
||||||
enable_microbatching: bool = False
|
enable_microbatching: bool = False
|
||||||
"""Enable microbatching for the model executor."""
|
"""Enable microbatching for the model executor."""
|
||||||
|
|
||||||
always_microbatch_if_enabled: bool = True
|
always_microbatch_if_enabled: bool = True
|
||||||
"""Always microbatch if microbatching is enabled. Easier to sync bewteen
|
"""Always microbatch if microbatching is enabled. Easier to sync between
|
||||||
dp workers."""
|
dp workers."""
|
||||||
|
|
||||||
microbatching_token_threshold: int = 4
|
microbatching_token_threshold: int = 4
|
||||||
"""The threshold for microbatching. If the number of tokens in the
|
"""The threshold for microbatching. If the number of tokens in the
|
||||||
request is greater than this threshold, microbatching will be used.
|
request is greater than this threshold, microbatching will be used.
|
||||||
@ -4324,16 +4324,16 @@ class VllmConfig:
|
|||||||
"full_cuda_graph is not supported with "
|
"full_cuda_graph is not supported with "
|
||||||
"cascade attention. Disabling cascade attention.")
|
"cascade attention. Disabling cascade attention.")
|
||||||
self.model_config.disable_cascade_attn = True
|
self.model_config.disable_cascade_attn = True
|
||||||
|
|
||||||
if self.parallel_config.enable_microbatching:
|
if self.parallel_config.enable_microbatching and \
|
||||||
|
self.compilation_config.level >= CompilationLevel.PIECEWISE:
|
||||||
# Microbatching is not supported with piecewise compilation yet.
|
# Microbatching is not supported with piecewise compilation yet.
|
||||||
# More specifically piecewise cuda-graphs
|
# More specifically piecewise cuda-graphs
|
||||||
if self.compilation_config.level >= CompilationLevel.PIECEWISE:
|
logger.warning_once(
|
||||||
logger.warning_once(
|
"Piecewise compilation is not supported with "
|
||||||
"Piecewise compilation is not supported with "
|
"microbatching. Disabling piecewise compilation.")
|
||||||
"microbatching. Disabling piecewise compilation.")
|
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
|
||||||
|
|
||||||
|
|
||||||
if self.model_config and self.model_config.use_mla and \
|
if self.model_config and self.model_config.use_mla and \
|
||||||
not (current_platform.is_cuda() or current_platform.is_rocm()):
|
not (current_platform.is_cuda() or current_platform.is_rocm()):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user