diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py
index 2b41834baf4d6..dc2bf0e79cbce 100644
--- a/benchmarks/benchmark_guided.py
+++ b/benchmarks/benchmark_guided.py
@@ -46,6 +46,12 @@ def run_vllm(requests: List[SampleRequest],
              warmup: bool = False) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**vars(engine_args))
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (
+            request.prompt_len + request.expected_output_len)
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " prompt_len and expected_output_len for all requests.")
 
     # Add the requests to the engine.
     prompts: List[str] = []
@@ -115,6 +121,13 @@ async def run_vllm_async(
 
     async with build_async_engine_client_from_engine_args(
             engine_args, disable_frontend_multiprocessing) as llm:
+        assert all(
+            llm.model_config.max_model_len >= (request.prompt_len +
+                                               request.expected_output_len)
+            for request in requests), (
+                "Please ensure that max_model_len is greater than the sum of"
+                " prompt_len and expected_output_len for all requests.")
+
         # Add the requests to the engine.
         prompts: List[str] = []
         sampling_params: List[SamplingParams] = []
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index b041626550b54..b1d68ea246945 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -42,6 +42,10 @@ def main(args: argparse.Namespace):
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
+    assert llm.llm_engine.model_config.max_model_len >= (
+        args.input_len + args.output_len), (
+            "Please ensure that max_model_len is greater than"
+            " the sum of input_len and output_len.")
 
     sampling_params = SamplingParams(
         n=args.n,
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index a32065e4e7c0f..24014e5b6c373 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -13,6 +13,11 @@
 from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser
 
+#Select a equi-probable random priority
+def get_random_flag():
+    return 0 if random.random() < 0.5 else 1
+
+
 def sample_requests(
     dataset_path: str,
     num_requests: int,
@@ -55,8 +60,7 @@ def sample_requests(
             # Prune too long sequences.
             continue
 
-        #Select a equi-probable random priority
-        priority = 0 if random.random() < 0.5 else 1
+        priority = get_random_flag()
 
         filtered_dataset.append((prompt, prompt_len, output_len, priority))
 
@@ -71,6 +75,12 @@ def run_vllm(
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
 
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " input_len and output_len for all requests.")
+
     # Add the requests to the engine.
     prompts = []
     sampling_params = []
@@ -103,8 +113,8 @@ def main(args: argparse.Namespace):
     if args.dataset is None:
         # Synthesize a prompt with the given input length.
         prompt = "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len)
-                    for _ in range(args.num_prompts)]
+        requests = [(prompt, args.input_len, args.output_len,
+                     get_random_flag()) for _ in range(args.num_prompts)]
     else:
         requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                    args.output_len)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index f7d87f1b336f4..ca54213c0646c 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -171,7 +171,12 @@ def run_vllm(
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
-
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (
+            request.prompt_len + request.expected_output_len)
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " prompt_len and expected_output_len for all requests.")
     # Add the requests to the engine.
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
@@ -229,6 +234,12 @@ async def run_vllm_async(
 
     async with build_async_engine_client_from_engine_args(
             engine_args, disable_frontend_multiprocessing) as llm:
+        assert all(
+            llm.model_config.max_model_len >= (request.prompt_len +
+                                               request.expected_output_len)
+            for request in requests), (
+                "Please ensure that max_model_len is greater than the sum of"
+                " prompt_len and expected_output_len for all requests.")
 
         # Add the requests to the engine.
         prompts: List[TextPrompt] = []