mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 08:04:58 +08:00
[Misc] Enable multi-step output streaming by default (#9047)
This commit is contained in:
parent
aeb37c2a72
commit
303d44790a
@@ -145,7 +145,7 @@ class EngineArgs:
|
||||
max_cpu_loras: Optional[int] = None
|
||||
device: str = 'auto'
|
||||
num_scheduler_steps: int = 1
|
||||
-multi_step_stream_outputs: bool = False
|
||||
+multi_step_stream_outputs: bool = True
|
||||
ray_workers_use_nsight: bool = False
|
||||
num_gpu_blocks_override: Optional[int] = None
|
||||
num_lookahead_slots: int = 0
|
||||
@@ -603,13 +603,17 @@ class EngineArgs:
|
||||
|
||||
parser.add_argument(
|
||||
'--multi-step-stream-outputs',
|
||||
-action='store_true',
|
||||
-help='If True, then multi-step will stream outputs for every step')
|
||||
+action=StoreBoolean,
|
||||
+default=EngineArgs.multi_step_stream_outputs,
|
||||
+nargs="?",
|
||||
+const="True",
|
||||
+help='If False, then multi-step will stream outputs at the end '
|
||||
+'of all steps')
|
||||
parser.add_argument(
|
||||
'--scheduler-delay-factor',
|
||||
type=float,
|
||||
default=EngineArgs.scheduler_delay_factor,
|
||||
-help='Apply a delay (of delay factor multiplied by previous'
|
||||
+help='Apply a delay (of delay factor multiplied by previous '
|
||||
'prompt latency) before scheduling next prompt.')
|
||||
parser.add_argument(
|
||||
'--enable-chunked-prefill',
|
||||
@@ -632,7 +636,7 @@ class EngineArgs:
|
||||
type=nullable_str,
|
||||
choices=[*QUANTIZATION_METHODS, None],
|
||||
default=EngineArgs.speculative_model_quantization,
|
||||
-help='Method used to quantize the weights of speculative model.'
|
||||
+help='Method used to quantize the weights of speculative model. '
|
||||
'If None, we first check the `quantization_config` '
|
||||
'attribute in the model config file. If that is '
|
||||
'None, we assume the model weights are not '
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user