Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-15 18:35:58 +08:00
enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
This commit is contained in:
parent 27a7b070db
commit 9cbc7e5f3b
@@ -74,6 +74,7 @@ def run_vllm(
     kv_cache_dtype: str,
     device: str,
     enable_prefix_caching: bool,
+    gpu_memory_utilization: float = 0.9,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(model=model,
@@ -84,6 +85,7 @@ def run_vllm(
               trust_remote_code=trust_remote_code,
               dtype=dtype,
               max_model_len=max_model_len,
+              gpu_memory_utilization=gpu_memory_utilization,
               enforce_eager=enforce_eager,
               kv_cache_dtype=kv_cache_dtype,
               device=device,
@@ -206,13 +208,12 @@ def main(args: argparse.Namespace):
                                    args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
-                                args.quantization, args.tensor_parallel_size,
-                                args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype,
-                                args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype, args.device,
-                                args.enable_prefix_caching)
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype, args.device,
+            args.enable_prefix_caching, args.gpu_memory_utilization)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -287,6 +288,12 @@ if __name__ == "__main__":
         'The "auto" option will use FP16 precision '
         'for FP32 and FP16 models, and BF16 precision '
         'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
     parser.add_argument("--enforce-eager",
                         action="store_true",
                         help="enforce eager execution")
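For readers who want to exercise the same wiring outside the benchmark, below is a minimal, self-contained sketch: an argparse flag named --gpu-memory-utilization forwarded to the LLM constructor, mirroring what this commit does in benchmark_throughput.py. The model name, prompt, and sampling settings are illustrative placeholders, not part of the commit.

import argparse

from vllm import LLM, SamplingParams


def main() -> None:
    parser = argparse.ArgumentParser()
    # Illustrative default; any Hugging Face model id works here.
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    # Same flag this commit adds to benchmark_throughput.py: the fraction
    # of GPU memory (0 to 1) reserved for the model executor.
    parser.add_argument('--gpu-memory-utilization', type=float, default=0.9)
    args = parser.parse_args()

    # argparse exposes the flag as args.gpu_memory_utilization, which is
    # passed straight through to the LLM constructor, as in run_vllm() above.
    llm = LLM(model=args.model,
              gpu_memory_utilization=args.gpu_memory_utilization)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    main()

With this change, the throughput benchmark can be run with, for example, --gpu-memory-utilization 0.5 to leave headroom for other processes on the same GPU instead of always reserving the default 90%.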