[Core] Make scheduling policy settable via EngineArgs (#8956)

Sebastian Schoennenbeck 2024-09-30 14:28:44 +02:00 committed by GitHub
parent 2ae25f79cf
commit be76e5aabf


@@ -2,8 +2,8 @@ import argparse
 import dataclasses
 import json
 from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
-                    Type, Union)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
+                    Tuple, Type, Union)

 import torch
@@ -177,6 +177,7 @@ class EngineArgs:
     disable_async_output_proc: bool = False
     override_neuron_config: Optional[Dict[str, Any]] = None
     mm_processor_kwargs: Optional[Dict[str, Any]] = None
+    scheduling_policy: Literal["fcfs", "priority"] = "fcfs"

     def __post_init__(self):
         if self.tokenizer is None:
@@ -797,6 +798,16 @@ class EngineArgs:
             default=None,
             help="override or set neuron device configuration.")
+        parser.add_argument(
+            '--scheduling-policy',
+            choices=['fcfs', 'priority'],
+            default="fcfs",
+            help='The scheduling policy to use: "fcfs" (first come, first '
+                 'served, i.e. requests are handled in order of arrival; '
+                 'the default) or "priority" (requests are handled based '
+                 'on a given priority, with a lower value meaning earlier '
+                 'handling and ties broken by arrival time).')

         return parser

     @classmethod
@@ -1011,6 +1022,7 @@ class EngineArgs:
             multi_step_stream_outputs=self.multi_step_stream_outputs,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
+            policy=self.scheduling_policy,
         )
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,
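
For reference, a minimal usage sketch of the new setting, assuming this commit's EngineArgs API; the model name is a placeholder:

    from vllm.engine.arg_utils import EngineArgs

    # "fcfs" remains the default; "priority" opts into priority scheduling.
    engine_args = EngineArgs(model="facebook/opt-125m",
                             scheduling_policy="priority")

    # create_engine_config() forwards the field into SchedulerConfig via
    # the policy= kwarg shown in the last hunk above.
    engine_config = engine_args.create_engine_config()

On the command line, the same setting is exposed as --scheduling-policy {fcfs,priority}, as added in the argparse hunk above.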