diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 5b4589410553b..5b946e191e453 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()
 
         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)
 
-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index d691c8a5f702a..3a80e679191e3 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()
     headers = {"User-Agent": "Benchmark Client"}
 
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
             if "error" not in output:
                 break
 
-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
 
@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer,
                               trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 5f8026ed3b90c..13df1a5a0c874 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )
 
-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start
 
 
@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()
 
     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0
@@ -156,7 +156,7 @@ def run_hf(
         batch = []
         max_prompt_len = 0
         max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start
 
 
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 18d6b3ed2ea75..8f381add8f396 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -121,7 +121,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}
 
         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()
 
         # Join waiting sequences if possible.
         if not self.swapped:
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 94674e9e6d8b8..aa77751242240 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -417,7 +417,8 @@ class AsyncLLMEngine:
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()
 
         try:
             stream = await self.add_request(request_id,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 76cafd87684d7..74a8905a916d7 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -256,10 +256,10 @@ class LLMEngine:
             prompt_token_ids: The token IDs of the prompt. If None, we use
                 the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@ class LLMEngine:
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 7ec155d2e488c..64a4b8656a27a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
 
     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
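
A note on the clock choice, for context: `time.time()` reads the wall clock, which can jump when the system clock is adjusted (NTP, DST, manual changes), so differences between two `time.time()` calls are not a reliable elapsed-time measurement. `time.monotonic()` and `time.perf_counter()` only ever move forward, but their values have an arbitrary reference point, which is why the new comment warns against using `arrival_time` for logging. The sketch below is a minimal standalone illustration of the pattern this diff adopts; it is not part of the patch, and `busy_work` is a made-up stand-in for the real workload.

```python
import time


def busy_work(n: int = 100_000) -> int:
    """Made-up stand-in for the real workload (e.g. llm.generate)."""
    return sum(i * i for i in range(n))


# Elapsed-time measurement: use a monotonic, high-resolution clock so the
# result is unaffected by system clock adjustments between the two reads.
start = time.perf_counter()
busy_work()
latency = time.perf_counter() - start
print(f"latency: {latency * 1000:.3f} ms")

# Monotonic values are only meaningful as differences; time.time() is the
# clock that returns seconds since the Unix epoch and belongs in
# timestamps and log lines.
print("time.time()     :", time.time())       # e.g. ~1.7e9, a real timestamp
print("time.monotonic():", time.monotonic())  # arbitrary origin, do not log
```

The split in the diff presumably follows the same reasoning: `perf_counter()` in the benchmarks, where the highest available resolution matters, and `monotonic()` in the scheduler and engine, where the values are only ever subtracted from one another.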