From 6768ff4a222b89dac3e2f49af8772216db01952b Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 1 May 2025 18:31:44 +0100
Subject: [PATCH] Move the last arguments in `arg_utils.py` to be in their
 final groups (#17531)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config.py           |   2 +-
 vllm/engine/arg_utils.py | 265 +++++++++++++++++++--------------------
 2 files changed, 132 insertions(+), 135 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 13b8c33785387..864903ddc4468 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1637,7 +1637,7 @@ class ParallelConfig:
     """Use expert parallelism instead of tensor parallelism for MoE layers."""

     max_parallel_loading_workers: Optional[int] = None
-    """Maximum number of parallal loading workers when loading model
+    """Maximum number of parallel loading workers when loading model
     sequentially in multiple batches. To avoid RAM OOM when using tensor
     parallel and large models."""

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 1d9b6b47ea851..ed32be7cba593 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -474,15 +474,21 @@ class EngineArgs:
             title="LoadConfig",
             description=LoadConfig.__doc__,
         )
-        load_group.add_argument('--load-format',
+        load_group.add_argument("--load-format",
                                 choices=[f.value for f in LoadFormat],
                                 **load_kwargs["load_format"])
-        load_group.add_argument('--download-dir',
+        load_group.add_argument("--download-dir",
                                 **load_kwargs["download_dir"])
-        load_group.add_argument('--model-loader-extra-config',
+        load_group.add_argument("--model-loader-extra-config",
                                 **load_kwargs["model_loader_extra_config"])
-        load_group.add_argument('--use-tqdm-on-load',
+        load_group.add_argument("--ignore-patterns",
+                                **load_kwargs["ignore_patterns"])
+        load_group.add_argument("--use-tqdm-on-load",
                                 **load_kwargs["use_tqdm_on_load"])
+        load_group.add_argument('--qlora-adapter-name-or-path',
+                                type=str,
+                                default=None,
+                                help='Name or path of the QLoRA adapter.')

         # Guided decoding arguments
         guided_decoding_kwargs = get_kwargs(DecodingConfig)
@@ -501,6 +507,14 @@ class EngineArgs:
         guided_decoding_group.add_argument(
             "--guided-decoding-disable-additional-properties",
             **guided_decoding_kwargs["disable_additional_properties"])
+        guided_decoding_group.add_argument(
+            "--enable-reasoning",
+            action=argparse.BooleanOptionalAction,
+            help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
+            "of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
+            "parser backend instead. This flag (`--enable-reasoning`) will be "
+            "removed in v0.10.0. When `--reasoning-parser` is specified, "
+            "reasoning mode is automatically enabled.")
         guided_decoding_group.add_argument(
             "--reasoning-parser",
             # This choices is a special case because it's not static
@@ -514,27 +528,31 @@ class EngineArgs:
             description=ParallelConfig.__doc__,
         )
         parallel_group.add_argument(
-            '--distributed-executor-backend',
+            "--distributed-executor-backend",
             **parallel_kwargs["distributed_executor_backend"])
         parallel_group.add_argument(
-            '--pipeline-parallel-size', '-pp',
+            "--pipeline-parallel-size", "-pp",
             **parallel_kwargs["pipeline_parallel_size"])
-        parallel_group.add_argument('--tensor-parallel-size', '-tp',
+        parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                     **parallel_kwargs["tensor_parallel_size"])
-        parallel_group.add_argument('--data-parallel-size', '-dp',
+        parallel_group.add_argument("--data-parallel-size", "-dp",
                                     **parallel_kwargs["data_parallel_size"])
         parallel_group.add_argument(
-            '--enable-expert-parallel',
+            "--enable-expert-parallel",
             **parallel_kwargs["enable_expert_parallel"])
         parallel_group.add_argument(
-            '--max-parallel-loading-workers',
+            "--max-parallel-loading-workers",
             **parallel_kwargs["max_parallel_loading_workers"])
         parallel_group.add_argument(
-            '--ray-workers-use-nsight',
+            "--ray-workers-use-nsight",
             **parallel_kwargs["ray_workers_use_nsight"])
         parallel_group.add_argument(
-            '--disable-custom-all-reduce',
+            "--disable-custom-all-reduce",
             **parallel_kwargs["disable_custom_all_reduce"])
+        parallel_group.add_argument("--worker-cls",
+                                    **parallel_kwargs["worker_cls"])
+        parallel_group.add_argument("--worker-extension-cls",
+                                    **parallel_kwargs["worker_extension_cls"])

         # KV cache arguments
         cache_kwargs = get_kwargs(CacheConfig)
@@ -542,47 +560,34 @@ class EngineArgs:
             title="CacheConfig",
             description=CacheConfig.__doc__,
         )
-        cache_group.add_argument('--block-size', **cache_kwargs["block_size"])
-        cache_group.add_argument('--gpu-memory-utilization',
+        cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
+        cache_group.add_argument("--gpu-memory-utilization",
                                  **cache_kwargs["gpu_memory_utilization"])
-        cache_group.add_argument('--swap-space', **cache_kwargs["swap_space"])
-        cache_group.add_argument('--kv-cache-dtype',
+        cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
+        cache_group.add_argument("--kv-cache-dtype",
                                  **cache_kwargs["cache_dtype"])
-        cache_group.add_argument('--num-gpu-blocks-override',
+        cache_group.add_argument("--num-gpu-blocks-override",
                                  **cache_kwargs["num_gpu_blocks_override"])
         cache_group.add_argument("--enable-prefix-caching",
                                  **cache_kwargs["enable_prefix_caching"])
         cache_group.add_argument("--prefix-caching-hash-algo",
                                  **cache_kwargs["prefix_caching_hash_algo"])
-        cache_group.add_argument('--cpu-offload-gb',
+        cache_group.add_argument("--cpu-offload-gb",
                                  **cache_kwargs["cpu_offload_gb"])
-        cache_group.add_argument('--calculate-kv-scales',
+        cache_group.add_argument("--calculate-kv-scales",
                                  **cache_kwargs["calculate_kv_scales"])

-        parser.add_argument('--use-v2-block-manager',
-                            action='store_true',
-                            default=True,
-                            help='[DEPRECATED] block manager v1 has been '
-                            'removed and SelfAttnBlockSpaceManager (i.e. '
-                            'block manager v2) is now the default. '
-                            'Setting this flag to True or False'
-                            ' has no effect on vLLM behavior.')
-
-        parser.add_argument('--disable-log-stats',
-                            action='store_true',
-                            help='Disable logging statistics.')
-
         # Tokenizer arguments
         tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
         tokenizer_group = parser.add_argument_group(
             title="TokenizerPoolConfig",
             description=TokenizerPoolConfig.__doc__,
         )
-        tokenizer_group.add_argument('--tokenizer-pool-size',
+        tokenizer_group.add_argument("--tokenizer-pool-size",
                                      **tokenizer_kwargs["pool_size"])
-        tokenizer_group.add_argument('--tokenizer-pool-type',
+        tokenizer_group.add_argument("--tokenizer-pool-type",
                                      **tokenizer_kwargs["pool_type"])
-        tokenizer_group.add_argument('--tokenizer-pool-extra-config',
+        tokenizer_group.add_argument("--tokenizer-pool-extra-config",
                                      **tokenizer_kwargs["extra_config"])

         # Multimodal related configs
@@ -591,13 +596,13 @@ class EngineArgs:
             title="MultiModalConfig",
             description=MultiModalConfig.__doc__,
         )
-        multimodal_group.add_argument('--limit-mm-per-prompt',
+        multimodal_group.add_argument("--limit-mm-per-prompt",
                                       **multimodal_kwargs["limit_per_prompt"])
         multimodal_group.add_argument(
-            '--mm-processor-kwargs',
+            "--mm-processor-kwargs",
             **multimodal_kwargs["mm_processor_kwargs"])
         multimodal_group.add_argument(
-            '--disable-mm-preprocessor-cache',
+            "--disable-mm-preprocessor-cache",
             **multimodal_kwargs["disable_mm_preprocessor_cache"])

         # LoRA related configs
@@ -607,25 +612,25 @@ class EngineArgs:
             description=LoRAConfig.__doc__,
         )
         lora_group.add_argument(
-            '--enable-lora',
+            "--enable-lora",
             action=argparse.BooleanOptionalAction,
-            help='If True, enable handling of LoRA adapters.')
-        lora_group.add_argument('--enable-lora-bias',
+            help="If True, enable handling of LoRA adapters.")
+        lora_group.add_argument("--enable-lora-bias",
                                 **lora_kwargs["bias_enabled"])
-        lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"])
-        lora_group.add_argument('--max-lora-rank',
+        lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
+        lora_group.add_argument("--max-lora-rank",
                                 **lora_kwargs["max_lora_rank"])
-        lora_group.add_argument('--lora-extra-vocab-size',
+        lora_group.add_argument("--lora-extra-vocab-size",
                                 **lora_kwargs["lora_extra_vocab_size"])
         lora_group.add_argument(
-            '--lora-dtype',
+            "--lora-dtype",
             **lora_kwargs["lora_dtype"],
         )
-        lora_group.add_argument('--long-lora-scaling-factors',
+        lora_group.add_argument("--long-lora-scaling-factors",
                                 **lora_kwargs["long_lora_scaling_factors"])
-        lora_group.add_argument('--max-cpu-loras',
+        lora_group.add_argument("--max-cpu-loras",
                                 **lora_kwargs["max_cpu_loras"])
-        lora_group.add_argument('--fully-sharded-loras',
+        lora_group.add_argument("--fully-sharded-loras",
                                 **lora_kwargs["fully_sharded_loras"])

         # PromptAdapter related configs
@@ -635,14 +640,14 @@ class EngineArgs:
             description=PromptAdapterConfig.__doc__,
         )
         prompt_adapter_group.add_argument(
-            '--enable-prompt-adapter',
+            "--enable-prompt-adapter",
             action=argparse.BooleanOptionalAction,
-            help='If True, enable handling of PromptAdapters.')
+            help="If True, enable handling of PromptAdapters.")
         prompt_adapter_group.add_argument(
-            '--max-prompt-adapters',
+            "--max-prompt-adapters",
             **prompt_adapter_kwargs["max_prompt_adapters"])
         prompt_adapter_group.add_argument(
-            '--max-prompt-adapter-token',
+            "--max-prompt-adapter-token",
             **prompt_adapter_kwargs["max_prompt_adapter_token"])

         # Device arguments
@@ -659,25 +664,11 @@ class EngineArgs:
             description=SpeculativeConfig.__doc__,
         )
         speculative_group.add_argument(
-            '--speculative-config',
+            "--speculative-config",
             type=json.loads,
             default=None,
-            help='The configurations for speculative decoding.'
-            ' Should be a JSON string.')
-
-        parser.add_argument(
-            '--ignore-patterns',
-            action="append",
-            type=str,
-            default=[],
-            help="The pattern(s) to ignore when loading the model."
-            "Default to `original/**/*` to avoid repeated loading of llama's "
-            "checkpoints.")
-
-        parser.add_argument('--qlora-adapter-name-or-path',
-                            type=str,
-                            default=None,
-                            help='Name or path of the QLoRA adapter.')
+            help="The configurations for speculative decoding. Should be a "
+            "JSON string.")

         # Observability arguments
         observability_kwargs = get_kwargs(ObservabilityConfig)
@@ -710,9 +701,9 @@ class EngineArgs:
             description=SchedulerConfig.__doc__,
         )
         scheduler_group.add_argument(
-            '--max-num-batched-tokens',
+            "--max-num-batched-tokens",
             **scheduler_kwargs["max_num_batched_tokens"])
-        scheduler_group.add_argument('--max-num-seqs',
+        scheduler_group.add_argument("--max-num-seqs",
                                      **scheduler_kwargs["max_num_seqs"])
         scheduler_group.add_argument(
             "--max-num-partial-prefills",
@@ -723,70 +714,78 @@ class EngineArgs:
         scheduler_group.add_argument(
             "--long-prefill-token-threshold",
             **scheduler_kwargs["long_prefill_token_threshold"])
-        scheduler_group.add_argument('--num-lookahead-slots',
+        scheduler_group.add_argument("--num-lookahead-slots",
                                      **scheduler_kwargs["num_lookahead_slots"])
-        scheduler_group.add_argument('--scheduler-delay-factor',
+        scheduler_group.add_argument("--scheduler-delay-factor",
                                      **scheduler_kwargs["delay_factor"])
-        scheduler_group.add_argument('--preemption-mode',
+        scheduler_group.add_argument("--preemption-mode",
                                      **scheduler_kwargs["preemption_mode"])
-        scheduler_group.add_argument('--num-scheduler-steps',
+        scheduler_group.add_argument("--num-scheduler-steps",
                                      **scheduler_kwargs["num_scheduler_steps"])
         scheduler_group.add_argument(
-            '--multi-step-stream-outputs',
+            "--multi-step-stream-outputs",
             **scheduler_kwargs["multi_step_stream_outputs"])
-        scheduler_group.add_argument('--scheduling-policy',
+        scheduler_group.add_argument("--scheduling-policy",
                                      **scheduler_kwargs["policy"])
         scheduler_group.add_argument(
-            '--enable-chunked-prefill',
+            "--enable-chunked-prefill",
             **scheduler_kwargs["enable_chunked_prefill"])
         scheduler_group.add_argument(
             "--disable-chunked-mm-input",
             **scheduler_kwargs["disable_chunked_mm_input"])
-        parser.add_argument('--scheduler-cls',
-                            **scheduler_kwargs["scheduler_cls"])
+        scheduler_group.add_argument("--scheduler-cls",
+                                     **scheduler_kwargs["scheduler_cls"])

-        parser.add_argument('--compilation-config',
-                            '-O',
-                            type=CompilationConfig.from_cli,
-                            default=None,
-                            help='torch.compile configuration for the model. '
-                            'When it is a number (0, 1, 2, 3), it will be '
-                            'interpreted as the optimization level.\n'
-                            'NOTE: level 0 is the default level without '
-                            'any optimization. level 1 and 2 are for internal '
-                            'testing only. level 3 is the recommended level '
-                            'for production.\n'
-                            'To specify the full compilation config, '
-                            'use a JSON string, e.g. ``{"level": 3, '
-                            '"cudagraph_capture_sizes": [1, 2, 4, 8]}``\n'
-                            'Following the convention of traditional '
-                            'compilers, using ``-O`` without space is also '
-                            'supported. ``-O3`` is equivalent to ``-O 3``.')
+        # Compilation arguments
+        # compilation_kwargs = get_kwargs(CompilationConfig)
+        compilation_group = parser.add_argument_group(
+            title="CompilationConfig",
+            description=CompilationConfig.__doc__,
+        )
+        compilation_group.add_argument(
+            "--compilation-config",
+            "-O",
+            type=CompilationConfig.from_cli,
+            default=None,
+            help="torch.compile configuration for the model. "
+            "When it is a number (0, 1, 2, 3), it will be "
+            "interpreted as the optimization level.\n"
+            "NOTE: level 0 is the default level without "
+            "any optimization. level 1 and 2 are for internal "
+            "testing only. level 3 is the recommended level "
+            "for production.\n"
+            "To specify the full compilation config, "
+            "use a JSON string, e.g. ``{\"level\": 3, "
+            "\"cudagraph_capture_sizes\": [1, 2, 4, 8]}``\n"
+            "Following the convention of traditional "
+            "compilers, using ``-O`` without space is also "
+            "supported. ``-O3`` is equivalent to ``-O 3``.")

-        parser.add_argument('--kv-transfer-config',
-                            type=KVTransferConfig.from_cli,
-                            default=None,
-                            help='The configurations for distributed KV cache '
-                            'transfer. Should be a JSON string.')
-        parser.add_argument('--kv-events-config',
-                            type=KVEventsConfig.from_cli,
-                            default=None,
-                            help='The configurations for event publishing.')
+        # KVTransfer arguments
+        # kv_transfer_kwargs = get_kwargs(KVTransferConfig)
+        kv_transfer_group = parser.add_argument_group(
+            title="KVTransferConfig",
+            description=KVTransferConfig.__doc__,
+        )
+        kv_transfer_group.add_argument(
+            "--kv-transfer-config",
+            type=KVTransferConfig.from_cli,
+            default=None,
+            help="The configurations for distributed KV cache "
+            "transfer. Should be a JSON string.")
+        kv_transfer_group.add_argument(
+            '--kv-events-config',
+            type=KVEventsConfig.from_cli,
+            default=None,
+            help='The configurations for event publishing.')

-        parser.add_argument(
-            '--worker-cls',
-            type=str,
-            default="auto",
-            help='The worker class to use for distributed execution.')
-        parser.add_argument(
-            '--worker-extension-cls',
-            type=str,
-            default="",
-            help='The worker extension class on top of the worker cls, '
-            'it is useful if you just want to add new functions to the worker '
-            'class without changing the existing functions.')
-
-        parser.add_argument(
+        # vLLM arguments
+        # vllm_kwargs = get_kwargs(VllmConfig)
+        vllm_group = parser.add_argument_group(
+            title="VllmConfig",
+            description=VllmConfig.__doc__,
+        )
+        vllm_group.add_argument(
             "--additional-config",
             type=json.loads,
             default=None,
@@ -795,20 +794,18 @@ class EngineArgs:
             "configs are valid for the platform you are using. The input format"
             " is like '{\"config_key\":\"config_value\"}'")

-        parser.add_argument(
-            "--enable-reasoning",
-            action="store_true",
-            default=False,
-            help=
-            "[DEPRECATED] " \
-            "The --enable-reasoning flag is deprecated as of v0.8.6. "
-            "Use --reasoning-parser to specify " \
-            "the reasoning parser backend instead. "
-            "This flag (--enable-reasoning) will be " \
-            "removed in v0.10.0. "
-            "When --reasoning-parser is specified, " \
-            "reasoning mode is automatically enabled."
-        )
+        # Other arguments
+        parser.add_argument('--use-v2-block-manager',
+                            action='store_true',
+                            default=True,
+                            help='[DEPRECATED] block manager v1 has been '
+                            'removed and SelfAttnBlockSpaceManager (i.e. '
+                            'block manager v2) is now the default. '
+                            'Setting this flag to True or False'
+                            ' has no effect on vLLM behavior.')
+        parser.add_argument('--disable-log-stats',
+                            action='store_true',
+                            help='Disable logging statistics.')

         return parser