Move the last arguments in arg_utils.py to be in their final groups (#17531)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor, 2025-05-01 18:31:44 +01:00, committed by GitHub
commit 6768ff4a22 (parent f2e7af9b86)
2 changed files with 132 additions and 135 deletions
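
Context for reviewers outside `arg_utils.py`: every engine flag is meant to be registered on the `argparse` group named after its config class, with the per-flag kwargs pulled from that class via `get_kwargs`. A minimal, self-contained sketch of that pattern (`ToyLoadConfig` and this `get_kwargs` body are illustrative stand-ins, not vLLM's actual helper):

```python
import argparse
from dataclasses import dataclass, fields

@dataclass
class ToyLoadConfig:
    """Configuration for loading model weights."""
    download_dir: str = "~/.cache/models"
    use_tqdm_on_load: bool = True

def get_kwargs(cls):
    # Illustrative stand-in for vLLM's get_kwargs: derive the add_argument
    # kwargs for each flag from the config dataclass's fields and defaults.
    return {
        f.name: {"default": f.default, "help": f"{f.name} (see {cls.__name__})"}
        for f in fields(cls)
    }

parser = argparse.ArgumentParser()
load_kwargs = get_kwargs(ToyLoadConfig)
load_group = parser.add_argument_group(
    title="ToyLoadConfig",
    description=ToyLoadConfig.__doc__,
)
load_group.add_argument("--download-dir", **load_kwargs["download_dir"])
load_group.add_argument("--use-tqdm-on-load", **load_kwargs["use_tqdm_on_load"])

args = parser.parse_args(["--download-dir", "/tmp/models"])
print(args.download_dir)  # /tmp/models
```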

vllm/config.py

@@ -1637,7 +1637,7 @@ class ParallelConfig:
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     max_parallel_loading_workers: Optional[int] = None
-    """Maximum number of parallal loading workers when loading model
+    """Maximum number of parallel loading workers when loading model
     sequentially in multiple batches. To avoid RAM OOM when using tensor
     parallel and large models."""

vllm/engine/arg_utils.py

@@ -474,15 +474,21 @@ class EngineArgs:
             title="LoadConfig",
             description=LoadConfig.__doc__,
         )
-        load_group.add_argument('--load-format',
+        load_group.add_argument("--load-format",
                                 choices=[f.value for f in LoadFormat],
                                 **load_kwargs["load_format"])
-        load_group.add_argument('--download-dir',
+        load_group.add_argument("--download-dir",
                                 **load_kwargs["download_dir"])
-        load_group.add_argument('--model-loader-extra-config',
+        load_group.add_argument("--model-loader-extra-config",
                                 **load_kwargs["model_loader_extra_config"])
-        load_group.add_argument('--use-tqdm-on-load',
+        load_group.add_argument("--ignore-patterns",
+                                **load_kwargs["ignore_patterns"])
+        load_group.add_argument("--use-tqdm-on-load",
                                 **load_kwargs["use_tqdm_on_load"])
+        load_group.add_argument('--qlora-adapter-name-or-path',
+                                type=str,
+                                default=None,
+                                help='Name or path of the QLoRA adapter.')

         # Guided decoding arguments
         guided_decoding_kwargs = get_kwargs(DecodingConfig)
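
Note that moving flags such as `--ignore-patterns` between groups is behavior-preserving: `argparse` groups only affect how `--help` is laid out, never parsing. A quick sanity check with a toy parser:

```python
import argparse

parser = argparse.ArgumentParser()
load_group = parser.add_argument_group(title="LoadConfig")
# Same action="append" semantics as the real flag; the group only
# changes where this option appears in --help output.
load_group.add_argument("--ignore-patterns", action="append", default=[])

args = parser.parse_args(["--ignore-patterns", "original/**/*"])
print(args.ignore_patterns)  # ['original/**/*']
```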
@@ -501,6 +507,14 @@ class EngineArgs:
         guided_decoding_group.add_argument(
             "--guided-decoding-disable-additional-properties",
             **guided_decoding_kwargs["disable_additional_properties"])
+        guided_decoding_group.add_argument(
+            "--enable-reasoning",
+            action=argparse.BooleanOptionalAction,
+            help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
+            "of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
+            "parser backend instead. This flag (`--enable-reasoning`) will be "
+            "removed in v0.10.0. When `--reasoning-parser` is specified, "
+            "reasoning mode is automatically enabled.")
         guided_decoding_group.add_argument(
             "--reasoning-parser",
             # This choices is a special case because it's not static
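
The deprecated flag stays usable during the transition because `argparse.BooleanOptionalAction` (Python 3.9+) registers both the positive and negative spellings. A minimal sketch of just that mechanism, outside vLLM:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-reasoning",
    action=argparse.BooleanOptionalAction,
    help="[DEPRECATED] Use --reasoning-parser instead.")

# Both spellings are accepted; omitting the flag leaves the value as None.
print(parser.parse_args([]).enable_reasoning)                         # None
print(parser.parse_args(["--enable-reasoning"]).enable_reasoning)     # True
print(parser.parse_args(["--no-enable-reasoning"]).enable_reasoning)  # False
```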
@@ -514,27 +528,31 @@ class EngineArgs:
             description=ParallelConfig.__doc__,
         )
         parallel_group.add_argument(
-            '--distributed-executor-backend',
+            "--distributed-executor-backend",
             **parallel_kwargs["distributed_executor_backend"])
         parallel_group.add_argument(
-            '--pipeline-parallel-size', '-pp',
+            "--pipeline-parallel-size", "-pp",
             **parallel_kwargs["pipeline_parallel_size"])
-        parallel_group.add_argument('--tensor-parallel-size', '-tp',
+        parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                     **parallel_kwargs["tensor_parallel_size"])
-        parallel_group.add_argument('--data-parallel-size', '-dp',
+        parallel_group.add_argument("--data-parallel-size", "-dp",
                                     **parallel_kwargs["data_parallel_size"])
         parallel_group.add_argument(
-            '--enable-expert-parallel',
+            "--enable-expert-parallel",
             **parallel_kwargs["enable_expert_parallel"])
         parallel_group.add_argument(
-            '--max-parallel-loading-workers',
+            "--max-parallel-loading-workers",
             **parallel_kwargs["max_parallel_loading_workers"])
         parallel_group.add_argument(
-            '--ray-workers-use-nsight',
+            "--ray-workers-use-nsight",
             **parallel_kwargs["ray_workers_use_nsight"])
         parallel_group.add_argument(
-            '--disable-custom-all-reduce',
+            "--disable-custom-all-reduce",
             **parallel_kwargs["disable_custom_all_reduce"])
+        parallel_group.add_argument("--worker-cls",
+                                    **parallel_kwargs["worker_cls"])
+        parallel_group.add_argument("--worker-extension-cls",
+                                    **parallel_kwargs["worker_extension_cls"])

         # KV cache arguments
         cache_kwargs = get_kwargs(CacheConfig)
@@ -542,47 +560,34 @@ class EngineArgs:
             title="CacheConfig",
             description=CacheConfig.__doc__,
         )
-        cache_group.add_argument('--block-size', **cache_kwargs["block_size"])
-        cache_group.add_argument('--gpu-memory-utilization',
+        cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
+        cache_group.add_argument("--gpu-memory-utilization",
                                  **cache_kwargs["gpu_memory_utilization"])
-        cache_group.add_argument('--swap-space', **cache_kwargs["swap_space"])
-        cache_group.add_argument('--kv-cache-dtype',
+        cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
+        cache_group.add_argument("--kv-cache-dtype",
                                  **cache_kwargs["cache_dtype"])
-        cache_group.add_argument('--num-gpu-blocks-override',
+        cache_group.add_argument("--num-gpu-blocks-override",
                                  **cache_kwargs["num_gpu_blocks_override"])
         cache_group.add_argument("--enable-prefix-caching",
                                  **cache_kwargs["enable_prefix_caching"])
         cache_group.add_argument("--prefix-caching-hash-algo",
                                  **cache_kwargs["prefix_caching_hash_algo"])
-        cache_group.add_argument('--cpu-offload-gb',
+        cache_group.add_argument("--cpu-offload-gb",
                                  **cache_kwargs["cpu_offload_gb"])
-        cache_group.add_argument('--calculate-kv-scales',
+        cache_group.add_argument("--calculate-kv-scales",
                                  **cache_kwargs["calculate_kv_scales"])
-        parser.add_argument('--use-v2-block-manager',
-                            action='store_true',
-                            default=True,
-                            help='[DEPRECATED] block manager v1 has been '
-                            'removed and SelfAttnBlockSpaceManager (i.e. '
-                            'block manager v2) is now the default. '
-                            'Setting this flag to True or False'
-                            ' has no effect on vLLM behavior.')
-        parser.add_argument('--disable-log-stats',
-                            action='store_true',
-                            help='Disable logging statistics.')

         # Tokenizer arguments
         tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
         tokenizer_group = parser.add_argument_group(
             title="TokenizerPoolConfig",
             description=TokenizerPoolConfig.__doc__,
         )
-        tokenizer_group.add_argument('--tokenizer-pool-size',
+        tokenizer_group.add_argument("--tokenizer-pool-size",
                                      **tokenizer_kwargs["pool_size"])
-        tokenizer_group.add_argument('--tokenizer-pool-type',
+        tokenizer_group.add_argument("--tokenizer-pool-type",
                                      **tokenizer_kwargs["pool_type"])
-        tokenizer_group.add_argument('--tokenizer-pool-extra-config',
+        tokenizer_group.add_argument("--tokenizer-pool-extra-config",
                                      **tokenizer_kwargs["extra_config"])

         # Multimodal related configs
@@ -591,13 +596,13 @@ class EngineArgs:
             title="MultiModalConfig",
             description=MultiModalConfig.__doc__,
         )
-        multimodal_group.add_argument('--limit-mm-per-prompt',
+        multimodal_group.add_argument("--limit-mm-per-prompt",
                                       **multimodal_kwargs["limit_per_prompt"])
         multimodal_group.add_argument(
-            '--mm-processor-kwargs',
+            "--mm-processor-kwargs",
             **multimodal_kwargs["mm_processor_kwargs"])
         multimodal_group.add_argument(
-            '--disable-mm-preprocessor-cache',
+            "--disable-mm-preprocessor-cache",
             **multimodal_kwargs["disable_mm_preprocessor_cache"])

         # LoRA related configs
@@ -607,25 +612,25 @@ class EngineArgs:
             description=LoRAConfig.__doc__,
         )
         lora_group.add_argument(
-            '--enable-lora',
+            "--enable-lora",
             action=argparse.BooleanOptionalAction,
-            help='If True, enable handling of LoRA adapters.')
-        lora_group.add_argument('--enable-lora-bias',
+            help="If True, enable handling of LoRA adapters.")
+        lora_group.add_argument("--enable-lora-bias",
                                 **lora_kwargs["bias_enabled"])
-        lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"])
-        lora_group.add_argument('--max-lora-rank',
+        lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
+        lora_group.add_argument("--max-lora-rank",
                                 **lora_kwargs["max_lora_rank"])
-        lora_group.add_argument('--lora-extra-vocab-size',
+        lora_group.add_argument("--lora-extra-vocab-size",
                                 **lora_kwargs["lora_extra_vocab_size"])
         lora_group.add_argument(
-            '--lora-dtype',
+            "--lora-dtype",
             **lora_kwargs["lora_dtype"],
         )
-        lora_group.add_argument('--long-lora-scaling-factors',
+        lora_group.add_argument("--long-lora-scaling-factors",
                                 **lora_kwargs["long_lora_scaling_factors"])
-        lora_group.add_argument('--max-cpu-loras',
+        lora_group.add_argument("--max-cpu-loras",
                                 **lora_kwargs["max_cpu_loras"])
-        lora_group.add_argument('--fully-sharded-loras',
+        lora_group.add_argument("--fully-sharded-loras",
                                 **lora_kwargs["fully_sharded_loras"])

         # PromptAdapter related configs
@@ -635,14 +640,14 @@ class EngineArgs:
             description=PromptAdapterConfig.__doc__,
         )
         prompt_adapter_group.add_argument(
-            '--enable-prompt-adapter',
+            "--enable-prompt-adapter",
             action=argparse.BooleanOptionalAction,
-            help='If True, enable handling of PromptAdapters.')
+            help="If True, enable handling of PromptAdapters.")
         prompt_adapter_group.add_argument(
-            '--max-prompt-adapters',
+            "--max-prompt-adapters",
             **prompt_adapter_kwargs["max_prompt_adapters"])
         prompt_adapter_group.add_argument(
-            '--max-prompt-adapter-token',
+            "--max-prompt-adapter-token",
             **prompt_adapter_kwargs["max_prompt_adapter_token"])

         # Device arguments
@@ -659,25 +664,11 @@ class EngineArgs:
             description=SpeculativeConfig.__doc__,
         )
         speculative_group.add_argument(
-            '--speculative-config',
+            "--speculative-config",
             type=json.loads,
             default=None,
-            help='The configurations for speculative decoding.'
-            ' Should be a JSON string.')
-
-        parser.add_argument(
-            '--ignore-patterns',
-            action="append",
-            type=str,
-            default=[],
-            help="The pattern(s) to ignore when loading the model."
-            "Default to `original/**/*` to avoid repeated loading of llama's "
-            "checkpoints.")
-
-        parser.add_argument('--qlora-adapter-name-or-path',
-                            type=str,
-                            default=None,
-                            help='Name or path of the QLoRA adapter.')
+            help="The configurations for speculative decoding. Should be a "
+            "JSON string.")

         # Observability arguments
         observability_kwargs = get_kwargs(ObservabilityConfig)
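
Because `--speculative-config` is declared with `type=json.loads`, the whole config travels as one JSON string and arrives as a dict. A hedged usage sketch; the keys shown are illustrative, and the accepted ones are defined by `SpeculativeConfig`:

```python
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument("--speculative-config", type=json.loads, default=None)

# argparse hands the raw string to json.loads, so the engine sees a dict.
args = parser.parse_args(
    ["--speculative-config", '{"method": "ngram", "num_speculative_tokens": 4}'])
print(args.speculative_config["num_speculative_tokens"])  # 4
```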
@@ -710,9 +701,9 @@ class EngineArgs:
             description=SchedulerConfig.__doc__,
         )
         scheduler_group.add_argument(
-            '--max-num-batched-tokens',
+            "--max-num-batched-tokens",
             **scheduler_kwargs["max_num_batched_tokens"])
-        scheduler_group.add_argument('--max-num-seqs',
+        scheduler_group.add_argument("--max-num-seqs",
                                      **scheduler_kwargs["max_num_seqs"])
         scheduler_group.add_argument(
             "--max-num-partial-prefills",
@@ -723,70 +714,78 @@ class EngineArgs:
         scheduler_group.add_argument(
             "--long-prefill-token-threshold",
             **scheduler_kwargs["long_prefill_token_threshold"])
-        scheduler_group.add_argument('--num-lookahead-slots',
+        scheduler_group.add_argument("--num-lookahead-slots",
                                      **scheduler_kwargs["num_lookahead_slots"])
-        scheduler_group.add_argument('--scheduler-delay-factor',
+        scheduler_group.add_argument("--scheduler-delay-factor",
                                      **scheduler_kwargs["delay_factor"])
-        scheduler_group.add_argument('--preemption-mode',
+        scheduler_group.add_argument("--preemption-mode",
                                      **scheduler_kwargs["preemption_mode"])
-        scheduler_group.add_argument('--num-scheduler-steps',
+        scheduler_group.add_argument("--num-scheduler-steps",
                                      **scheduler_kwargs["num_scheduler_steps"])
         scheduler_group.add_argument(
-            '--multi-step-stream-outputs',
+            "--multi-step-stream-outputs",
             **scheduler_kwargs["multi_step_stream_outputs"])
-        scheduler_group.add_argument('--scheduling-policy',
+        scheduler_group.add_argument("--scheduling-policy",
                                      **scheduler_kwargs["policy"])
         scheduler_group.add_argument(
-            '--enable-chunked-prefill',
+            "--enable-chunked-prefill",
             **scheduler_kwargs["enable_chunked_prefill"])
         scheduler_group.add_argument(
             "--disable-chunked-mm-input",
             **scheduler_kwargs["disable_chunked_mm_input"])
-        parser.add_argument('--scheduler-cls',
-                            **scheduler_kwargs["scheduler_cls"])
+        scheduler_group.add_argument("--scheduler-cls",
+                                     **scheduler_kwargs["scheduler_cls"])

-        parser.add_argument('--compilation-config',
-                            '-O',
-                            type=CompilationConfig.from_cli,
-                            default=None,
-                            help='torch.compile configuration for the model. '
-                            'When it is a number (0, 1, 2, 3), it will be '
-                            'interpreted as the optimization level.\n'
-                            'NOTE: level 0 is the default level without '
-                            'any optimization. level 1 and 2 are for internal '
-                            'testing only. level 3 is the recommended level '
-                            'for production.\n'
-                            'To specify the full compilation config, '
-                            'use a JSON string, e.g. ``{"level": 3, '
-                            '"cudagraph_capture_sizes": [1, 2, 4, 8]}``\n'
-                            'Following the convention of traditional '
-                            'compilers, using ``-O`` without space is also '
-                            'supported. ``-O3`` is equivalent to ``-O 3``.')
+        # Compilation arguments
+        # compilation_kwargs = get_kwargs(CompilationConfig)
+        compilation_group = parser.add_argument_group(
+            title="CompilationConfig",
+            description=CompilationConfig.__doc__,
+        )
+        compilation_group.add_argument(
+            "--compilation-config",
+            "-O",
+            type=CompilationConfig.from_cli,
+            default=None,
+            help="torch.compile configuration for the model. "
+            "When it is a number (0, 1, 2, 3), it will be "
+            "interpreted as the optimization level.\n"
+            "NOTE: level 0 is the default level without "
+            "any optimization. level 1 and 2 are for internal "
+            "testing only. level 3 is the recommended level "
+            "for production.\n"
+            "To specify the full compilation config, "
+            "use a JSON string, e.g. ``{\"level\": 3, "
+            "\"cudagraph_capture_sizes\": [1, 2, 4, 8]}``\n"
+            "Following the convention of traditional "
+            "compilers, using ``-O`` without space is also "
+            "supported. ``-O3`` is equivalent to ``-O 3``.")

-        parser.add_argument('--kv-transfer-config',
-                            type=KVTransferConfig.from_cli,
-                            default=None,
-                            help='The configurations for distributed KV cache '
-                            'transfer. Should be a JSON string.')
-        parser.add_argument('--kv-events-config',
-                            type=KVEventsConfig.from_cli,
-                            default=None,
-                            help='The configurations for event publishing.')
+        # KVTransfer arguments
+        # kv_transfer_kwargs = get_kwargs(KVTransferConfig)
+        kv_transfer_group = parser.add_argument_group(
+            title="KVTransferConfig",
+            description=KVTransferConfig.__doc__,
+        )
+        kv_transfer_group.add_argument(
+            "--kv-transfer-config",
+            type=KVTransferConfig.from_cli,
+            default=None,
+            help="The configurations for distributed KV cache "
+            "transfer. Should be a JSON string.")
+        kv_transfer_group.add_argument(
+            '--kv-events-config',
+            type=KVEventsConfig.from_cli,
+            default=None,
+            help='The configurations for event publishing.')

-        parser.add_argument(
-            '--worker-cls',
-            type=str,
-            default="auto",
-            help='The worker class to use for distributed execution.')
-        parser.add_argument(
-            '--worker-extension-cls',
-            type=str,
-            default="",
-            help='The worker extension class on top of the worker cls, '
-            'it is useful if you just want to add new functions to the worker '
-            'class without changing the existing functions.')
-        parser.add_argument(
+        # vLLM arguments
+        # vllm_kwargs = get_kwargs(VllmConfig)
+        vllm_group = parser.add_argument_group(
+            title="VllmConfig",
+            description=VllmConfig.__doc__,
+        )
+        vllm_group.add_argument(
             "--additional-config",
             type=json.loads,
             default=None,
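
The `-O3` / `-O 3` equivalence promised in the help text above is plain `argparse` behavior for single-character options, independent of vLLM. A minimal check with a stand-in for `CompilationConfig.from_cli` (the real method accepts more than bare JSON):

```python
import argparse
import json

def from_cli(value: str):
    # Stand-in for CompilationConfig.from_cli: a bare level like "3" and a
    # JSON object both decode with json.loads.
    return json.loads(value)

parser = argparse.ArgumentParser()
parser.add_argument("--compilation-config", "-O", type=from_cli, default=None)

# Short options accept an attached value, so -O3 parses like -O 3.
print(parser.parse_args(["-O3"]).compilation_config)      # 3
print(parser.parse_args(["-O", "3"]).compilation_config)  # 3
print(parser.parse_args(
    ["-O", '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}']
).compilation_config)  # {'level': 3, 'cudagraph_capture_sizes': [1, 2, 4, 8]}
```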
@@ -795,20 +794,18 @@ class EngineArgs:
             "configs are valid for the platform you are using. The input format"
             " is like '{\"config_key\":\"config_value\"}'")

-        parser.add_argument(
-            "--enable-reasoning",
-            action="store_true",
-            default=False,
-            help=
-            "[DEPRECATED] " \
-            "The --enable-reasoning flag is deprecated as of v0.8.6. "
-            "Use --reasoning-parser to specify " \
-            "the reasoning parser backend instead. "
-            "This flag (--enable-reasoning) will be " \
-            "removed in v0.10.0. "
-            "When --reasoning-parser is specified, " \
-            "reasoning mode is automatically enabled."
-        )
+        # Other arguments
+        parser.add_argument('--use-v2-block-manager',
+                            action='store_true',
+                            default=True,
+                            help='[DEPRECATED] block manager v1 has been '
+                            'removed and SelfAttnBlockSpaceManager (i.e. '
+                            'block manager v2) is now the default. '
+                            'Setting this flag to True or False'
+                            ' has no effect on vLLM behavior.')
+        parser.add_argument('--disable-log-stats',
+                            action='store_true',
+                            help='Disable logging statistics.')

         return parser
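
A final note on the kept-for-compatibility flags above: with `action='store_true'` and `default=True`, `--use-v2-block-manager` can never change the parsed value, which is exactly what makes it a safe no-op until its removal:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use-v2-block-manager", action="store_true", default=True)

# Present or absent, the parsed value is always True, so the flag is inert.
print(parser.parse_args([]).use_v2_block_manager)                          # True
print(parser.parse_args(["--use-v2-block-manager"]).use_v2_block_manager)  # True
```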