mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-22 02:15:28 +08:00
Support max-model-len argument for throughput benchmark (#1858)
This commit is contained in:
parent
51d3cb951d
commit
8d8c2f6ffe
@@ -69,6 +69,7 @@ def run_vllm(
|
|||||||
use_beam_search: bool,
|
use_beam_search: bool,
|
||||||
trust_remote_code: bool,
|
trust_remote_code: bool,
|
||||||
dtype: str,
|
dtype: str,
|
||||||
|
max_model_len: Optional[int] = None,
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@@ -79,6 +80,7 @@ def run_vllm(
|
|||||||
seed=seed,
|
seed=seed,
|
||||||
trust_remote_code=trust_remote_code,
|
trust_remote_code=trust_remote_code,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
|
max_model_len=max_model_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
@@ -201,7 +203,8 @@ def main(args: argparse.Namespace):
|
|||||||
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
|
elapsed_time = run_vllm(requests, args.model, args.tokenizer,
|
||||||
args.quantization, args.tensor_parallel_size,
|
args.quantization, args.tensor_parallel_size,
|
||||||
args.seed, args.n, args.use_beam_search,
|
args.seed, args.n, args.use_beam_search,
|
||||||
args.trust_remote_code, args.dtype)
|
args.trust_remote_code, args.dtype,
|
||||||
|
args.max_model_len)
|
||||||
elif args.backend == "hf":
|
elif args.backend == "hf":
|
||||||
assert args.tensor_parallel_size == 1
|
assert args.tensor_parallel_size == 1
|
||||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||||
@@ -261,6 +264,12 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument('--trust-remote-code',
|
parser.add_argument('--trust-remote-code',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help='trust remote code from huggingface')
|
help='trust remote code from huggingface')
|
||||||
|
parser.add_argument(
|
||||||
|
'--max-model-len',
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help='Maximum length of a sequence (including prompt and output). '
|
||||||
|
'If None, will be derived from the model.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--dtype',
|
'--dtype',
|
||||||
type=str,
|
type=str,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user