Use monotonic time where appropriate (#1249)

Antoni Baum 2023-10-02 19:22:05 -07:00 committed by GitHub
parent 66d18a7fb0
commit acbed3ef40
7 changed files with 18 additions and 17 deletions
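The substance of the change: elapsed-time measurements in the benchmark scripts switch from time.time() to time.perf_counter(), and internal bookkeeping in the engine switches to time.monotonic(). Both clocks are guaranteed never to move backwards, so the difference of two readings cannot be distorted by NTP corrections or manual clock changes, which can shift the wall-clock time.time(). A minimal sketch of the timing pattern (the placeholder workload is illustrative, not code from this repository):

import time

def timed(fn, *args, **kwargs):
    # time.perf_counter() is monotonic and has the highest available resolution,
    # so the difference between two readings is a reliable elapsed-time measurement
    # even if the system clock is adjusted in between.
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# time.sleep is only a placeholder workload for this sketch.
_, latency = timed(time.sleep, 0.1)
print(f"latency: {latency:.4f} s")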

View File

@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()
         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)
-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()

View File

@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()
     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
         if "error" not in output:
             break
-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")

View File

@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )
-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start
@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()
     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0
@@ -156,7 +156,7 @@ def run_hf(
         batch = []
         max_prompt_len = 0
         max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start
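run_vllm and run_hf return the elapsed seconds measured with perf_counter, and the caller can derive request and token throughput by dividing by that duration. A hedged sketch of that aggregation with illustrative names (measure_throughput is not a function in the repository):

import time

def measure_throughput(requests, run_batch):
    # run_batch stands in for run_vllm/run_hf: it processes every request and returns.
    start = time.perf_counter()
    run_batch(requests)
    elapsed = time.perf_counter() - start
    total_tokens = sum(prompt_len + output_len for _, prompt_len, output_len in requests)
    return len(requests) / elapsed, total_tokens / elapsed

# Dummy workload: each request is (prompt, prompt_len, output_len).
requests = [("hello", 8, 32)] * 100
req_per_s, tok_per_s = measure_throughput(requests, lambda batch: time.sleep(0.01))
print(f"{req_per_s:.1f} requests/s, {tok_per_s:.1f} tokens/s")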

View File

@@ -121,7 +121,7 @@ class Scheduler:
         blocks_to_copy: Dict[int, List[int]] = {}
         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()
         # Join waiting sequences if possible.
         if not self.swapped:
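The scheduler reads the clock once per scheduling pass and reuses that value, so every sequence group is compared against the same instant and the ordering cannot be skewed by time passing mid-loop. A minimal sketch of the idea; PendingRequest and longest_waiting_first are hypothetical names, not part of the Scheduler class:

import time

class PendingRequest:
    def __init__(self) -> None:
        # Arrival is recorded on the monotonic clock so it can be compared
        # against later monotonic readings without clock-jump artifacts.
        self.arrival_time = time.monotonic()

def longest_waiting_first(waiting: list) -> list:
    # Fix the current time once so every request in this pass is ranked
    # against the same instant, mirroring the "Fix the current time" comment above.
    now = time.monotonic()
    return sorted(waiting, key=lambda r: now - r.arrival_time, reverse=True)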

View File

@@ -417,7 +417,8 @@ class AsyncLLMEngine:
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()
         try:
             stream = await self.add_request(request_id,
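The added comment matters because a monotonic reading is an offset from an arbitrary origin: it is only meaningful relative to another reading from the same clock, never as a calendar timestamp. A small illustration of the distinction (not code from this diff):

import time
from datetime import datetime

arrival = time.monotonic()   # offset from an arbitrary origin, e.g. 12345.678
wall = time.time()           # seconds since the Unix epoch

# The monotonic value is what you subtract to get a robust duration ...
queueing_delay = time.monotonic() - arrival

# ... while only the wall-clock value can be rendered as a human-readable timestamp.
print(f"queued for {queueing_delay:.6f} s, received at {datetime.fromtimestamp(wall).isoformat()}")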

View File

@@ -256,10 +256,10 @@ class LLMEngine:
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@ class LLMEngine:
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
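Because num_prompt_tokens stores (monotonic timestamp, token count) pairs, throughput over a recent window can be computed as tokens divided by the monotonic span, which never jumps backwards. A hedged sketch of that bookkeeping; the window length and names are illustrative, not the engine's actual fields:

import time

_WINDOW = 10.0  # seconds of history to keep; illustrative value
samples: list = []  # (monotonic timestamp, number of batched tokens)

def record(num_tokens: int) -> None:
    now = time.monotonic()
    samples.append((now, num_tokens))
    # Evict samples that fell out of the window; monotonic time cannot go
    # backwards, so the window boundary is well defined.
    while samples and now - samples[0][0] > _WINDOW:
        samples.pop(0)

def tokens_per_second() -> float:
    if len(samples) < 2:
        return 0.0
    span = samples[-1][0] - samples[0][0]
    return sum(n for _, n in samples) / span if span > 0 else 0.0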

View File

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,