diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 9ae5aa9b42da..8a8639cf5425 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -67,12 +67,14 @@ def run_vllm(
     seed: int,
     n: int,
     use_beam_search: bool,
+    trust_remote_code: bool,
 ) -> float:
     llm = LLM(
         model=model,
         tokenizer=tokenizer,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
+        trust_remote_code=trust_remote_code
     )
 
     # Add the requests to the engine.
@@ -106,9 +108,10 @@ def run_hf(
     n: int,
     use_beam_search: bool,
     max_batch_size: int,
+    trust_remote_code: bool,
 ) -> float:
     assert not use_beam_search
-    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
@@ -161,13 +164,13 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)
 
     # Sample the requests.
-    tokenizer = get_tokenizer(args.tokenizer)
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     if args.backend == "vllm":
         elapsed_time = run_vllm(
             requests, args.model, args.tokenizer, args.tensor_parallel_size,
-            args.seed, args.n, args.use_beam_search)
+            args.seed, args.n, args.use_beam_search, args.trust_remote_code)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -199,6 +202,9 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--hf-max-batch-size", type=int, default=None,
                         help="Maximum batch size for HF backend.")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
    args = parser.parse_args()
 
     if args.backend == "vllm":
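
Note: the diff above only forwards the new `--trust-remote-code` flag through the benchmark script; the sketch below is not part of the diff and merely illustrates what the forwarded `trust_remote_code=True` enables in the `LLM` entrypoint the benchmark constructs. The model name is only an example of a repository that ships custom modeling code; substitute whatever model you actually benchmark.

```python
# Minimal sketch (assumption, not from this PR): passing trust_remote_code=True
# allows Hugging Face to execute model/tokenizer code bundled inside the model
# repository, which some architectures require in order to load at all.
from vllm import LLM, SamplingParams

# "mosaicml/mpt-7b" is an illustrative model that needs remote code.
llm = LLM(model="mosaicml/mpt-7b", trust_remote_code=True)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
```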