Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 07:57:45 +08:00)
Log system stats (#90)
parent 8d66a7b6d7
commit e331957784
@@ -5,6 +5,7 @@ import time
 from typing import Any, Dict, List, Optional, Tuple
 
 from cacheflow.core.block_manager import BlockSpaceManager
+from cacheflow.logger import init_logger
 from cacheflow.core.policy import PolicyFactory
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.sequence import Sequence
@@ -14,6 +15,10 @@ from cacheflow.sequence import SequenceOutputs
 from cacheflow.sequence import SequenceStatus
 
+logger = init_logger(__name__)
+
+_LOGGING_INTERVAL_SEC = 10
+
 
 class PreemptionMode(enum.Enum):
     """Preemption modes.
 
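The new module-level logger and _LOGGING_INTERVAL_SEC constant support interval-throttled logging in the scheduler: statistics are emitted at most once every 10 seconds instead of on every scheduling step. A minimal standalone sketch of that throttling pattern (the class and method names below are illustrative, not taken from the diff):

import time

_LOGGING_INTERVAL_SEC = 10  # same interval the diff introduces


class ThrottledReporter:
    """Illustrative only: emit a message at most once per interval."""

    def __init__(self) -> None:
        self.last_logging_time: float = 0.0

    def maybe_report(self, message: str) -> None:
        now = time.time()
        # Skip unless the interval has elapsed since the last report.
        if now - self.last_logging_time < _LOGGING_INTERVAL_SEC:
            return
        self.last_logging_time = now
        print(message)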
@@ -37,8 +42,7 @@ class Scheduler:
         num_cpu_blocks: int,
         max_num_batched_tokens: int,
         max_num_sequences: int,
-        collect_stats: bool,
-        do_memory_analysis: bool = False,
+        log_stats: bool,
     ) -> None:
         self.controllers = controllers
         self.block_size = block_size
@@ -46,8 +50,7 @@ class Scheduler:
         self.num_cpu_blocks = num_cpu_blocks
         self.max_num_batched_tokens = max_num_batched_tokens
         self.max_num_sequences = max_num_sequences
-        self.collect_stats = collect_stats
-        self.do_memory_analysis = do_memory_analysis
+        self.log_stats = log_stats
 
         # Instantiate the scheduling policy.
         self.policy = PolicyFactory.get_policy(policy_name='fcfs')
@@ -69,8 +72,9 @@ class Scheduler:
         # Sequence groups in the SWAPPED state.
         self.swapped: List[SequenceGroup] = []
 
-        # Performance-related statistics.
-        self.stats = Stats(num_gpu_blocks, num_cpu_blocks)
+        self.last_logging_time: float = 0.0
+        # List[timestamp, num_tokens]
+        self.num_input_tokens: List[Tuple[float, int]] = []
 
     def add_sequence_groups(
         self,
@@ -186,59 +190,46 @@ class Scheduler:
             num_batched_tokens += num_prompt_tokens
             prompt_group_ids.append(seq_group.group_id)
 
-        if self.collect_stats:
-            if self.running or blocks_to_swap_in or blocks_to_swap_out:
-                self.stats.timestamps.append(now - self.stats.start_time)
-                self.stats.input_lens.append(num_batched_tokens)
-                self.stats.swap_out_lens.append(len(blocks_to_swap_out) * self.block_size)
-                self.stats.swap_in_lens.append(len(blocks_to_swap_in) * self.block_size)
-                self.stats.num_preemption.append(len(preempted))
-                self.stats.num_swapped.append(len(self.swapped))
-                self.stats.num_running.append(len(self.running))
-                self.stats.num_waiting.append(len(self.waiting))
-                num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
-                num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
-                self.stats.gpu_cache_usage.append(num_used_gpu_blocks / self.num_gpu_blocks)
-                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
-                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
-                self.stats.cpu_cache_usage.append(num_used_cpu_blocks / self.num_cpu_blocks)
-
-                if self.do_memory_analysis:
-                    block_tables = self.block_manager.block_tables
-                    num_logical_blocks = 0
-                    num_logical_tokens = 0
-                    num_physical_blocks = 0
-                    num_physical_tokens = 0
-                    physical_block_numbers = set()
-                    num_reserved_tokens = 0
-                    for seq_group in self.running:
-                        group_id = seq_group.group_id
-                        sampling_params = self.sampling_params[group_id]
-                        max_num_steps = sampling_params.max_num_steps
-                        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-                            num_logical_blocks += len(seq.logical_token_blocks)
-                            num_logical_tokens += seq.get_len()
-
-                            seq_id = seq.seq_id
-                            block_table = block_tables[seq_id]
-                            for i, block in enumerate(block_table):
-                                if block.block_number in physical_block_numbers:
-                                    continue
-                                physical_block_numbers.add(block.block_number)
-                                num_physical_blocks += 1
-                                num_physical_tokens += seq.logical_token_blocks[i].num_tokens
-
-                    assert num_physical_blocks == num_used_gpu_blocks
-                    self.stats.num_logical_blocks.append(num_logical_blocks)
-                    self.stats.num_logical_tokens.append(num_logical_tokens)
-                    self.stats.num_physical_blocks.append(num_physical_blocks)
-                    self.stats.num_physical_tokens.append(num_physical_tokens)
-                    self.stats.num_reserved_tokens.append(num_reserved_tokens)
-
-        return (blocks_to_swap_in,
-                blocks_to_swap_out,
-                blocks_to_copy,
+        if not self.log_stats:
+            return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
+                    prompt_group_ids)
+
+        now = time.time()
+        if num_batched_tokens > 0:
+            self.num_input_tokens.append((now, num_batched_tokens))
+        elapsed_time = now - self.last_logging_time
+        if elapsed_time > _LOGGING_INTERVAL_SEC:
+            self.last_logging_time = now
+            self.num_input_tokens = [
+                (t, n) for t, n in self.num_input_tokens
+                if now - t < _LOGGING_INTERVAL_SEC
+            ]
+            if len(self.num_input_tokens) > 1:
+                total_num_tokens = sum(n for _, n in self.num_input_tokens[:-1])
+                window = now - self.num_input_tokens[0][0]
+                avg_throughput = total_num_tokens / window
+            else:
+                avg_throughput = 0.0
+
+            num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
+            num_used_gpu_blocks = self.num_gpu_blocks - num_free_gpu_blocks
+            gpu_cache_usage = num_used_gpu_blocks / self.num_gpu_blocks
+            if self.num_cpu_blocks > 0:
+                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
+                num_used_cpu_blocks = self.num_cpu_blocks - num_free_cpu_blocks
+                cpu_cache_usage = num_used_cpu_blocks / self.num_cpu_blocks
+            else:
+                cpu_cache_usage = 0.0
+
+            logger.info(
+                f"Throughput: {avg_throughput:.1f} tokens/s, "
+                f"Running: {len(self.running)} reqs, "
+                f"Swapped: {len(self.swapped)} reqs, "
+                f"Pending: {len(self.waiting)} reqs, "
+                f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
+                f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
+
+        return (blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy,
                 prompt_group_ids)
 
     def step(self) -> List[SequenceGroup]:
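The logging branch above keeps a rolling list of (timestamp, num_tokens) pairs, drops entries older than the logging interval, and derives an average token throughput from the remaining samples, excluding the newest one so the window is fully covered. A self-contained sketch of that sliding-window computation, using hypothetical names and assuming the same windowing rule as the diff:

import time
from typing import List, Tuple

_LOGGING_INTERVAL_SEC = 10


def sliding_window_throughput(
    samples: List[Tuple[float, int]],
    now: float,
) -> Tuple[List[Tuple[float, int]], float]:
    """Return the pruned samples and the average tokens/s over the window."""
    # Keep only samples that fall inside the logging window.
    samples = [(t, n) for t, n in samples if now - t < _LOGGING_INTERVAL_SEC]
    if len(samples) > 1:
        # The newest sample is excluded, mirroring the scheduler's calculation.
        total_num_tokens = sum(n for _, n in samples[:-1])
        window = now - samples[0][0]
        avg_throughput = total_num_tokens / window
    else:
        avg_throughput = 0.0
    return samples, avg_throughput


# Example: three batches recorded over roughly two seconds.
now = time.time()
history = [(now - 2.0, 512), (now - 1.0, 256), (now, 128)]
pruned, tput = sliding_window_throughput(history, now)
print(f"{tput:.1f} tokens/s over {len(pruned)} samples")  # 384.0 tokens/s over 3 samples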
@@ -455,75 +446,3 @@ class Scheduler:
         blocks_to_swap_out.update(mapping)
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
             seq.status = SequenceStatus.SWAPPED
-
-    def reset_stats(self) -> None:
-        self.stats.reset(self.num_gpu_blocks, self.num_cpu_blocks)
-
-    def save_stats(
-        self,
-        output_dir: str,
-    ) -> None:
-        assert self.collect_stats, 'Statistics collection is disabled.'
-        self.stats.save(output_dir)
-
-
-class Stats:
-
-    def __init__(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.start_time: float = time.time()
-        self.num_gpu_blocks = num_gpu_blocks
-        self.num_cpu_blocks = num_cpu_blocks
-
-        self.timestamps: List[float] = []
-        self.input_lens: List[int] = []
-        self.swap_out_lens: List[int] = []
-        self.swap_in_lens: List[int] = []
-        self.num_preemption: List[int] = []
-        self.num_waiting: List[int] = []
-        self.num_running: List[int] = []
-        self.num_swapped: List[int] = []
-        self.gpu_cache_usage: List[float] = []
-        self.cpu_cache_usage: List[float] = []
-
-        self.num_logical_blocks: List[int] = []
-        self.num_logical_tokens: List[int] = []
-        self.num_physical_blocks: List[int] = []
-        self.num_physical_tokens: List[int] = []
-        self.num_reserved_tokens: List[int] = []
-
-    def reset(
-        self,
-        num_gpu_blocks: int,
-        num_cpu_blocks: int,
-    ) -> None:
-        self.__init__(num_gpu_blocks, num_cpu_blocks)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            'start_time': self.start_time,
-            'num_gpu_blocks': self.num_gpu_blocks,
-            'num_cpu_blocks': self.num_cpu_blocks,
-            'timestamps': self.timestamps,
-            'input_lens': self.input_lens,
-            'swap_out_lens': self.swap_out_lens,
-            'swap_in_lens': self.swap_in_lens,
-            'num_preemption': self.num_preemption,
-            'num_waiting': self.num_waiting,
-            'num_running': self.num_running,
-            'num_swapped': self.num_swapped,
-            'gpu_cache_usage': self.gpu_cache_usage,
-            'cpu_cache_usage': self.cpu_cache_usage,
-            'num_logical_blocks': self.num_logical_blocks,
-            'num_logical_tokens': self.num_logical_tokens,
-            'num_physical_blocks': self.num_physical_blocks,
-            'num_physical_tokens': self.num_physical_tokens,
-            'num_reserved_tokens': self.num_reserved_tokens,
-        }
-
-    def save(self, output_dir: str) -> None:
-        with open(os.path.join(output_dir, 'stats.pkl'), 'wb') as f:
-            pickle.dump(self.to_dict(), f)
@@ -44,18 +44,16 @@ class Server:
         gpu_memory: int,
         cpu_memory: int,
         use_ray: bool,
-        collect_stats: bool = False,
-        do_memory_analysis: bool = False,
+        log_stats: bool,
     ):
         logger.info(
             "Initializing a server with config: "
             f"model={model!r}, "
             f"dtype={dtype}, "
             f"use_dummy_weights={use_dummy_weights}, "
-            f"cache_dir={cache_dir}, "
+            f"cache_dir={cache_dir!r}, "
             f"use_np_cache={use_np_cache}, "
             f"tensor_parallel_size={tensor_parallel_size}, "
-            f"block_size={block_size}, "
             f"seed={seed})"
         )
         self.num_nodes = num_nodes
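One small cleanup in the config log line: cache_dir is now formatted with !r, matching the existing model={model!r} field, so string values are printed with quotes and None is distinguishable from the string 'None'. A quick illustration (the path is made up for the example):

cache_dir = "/tmp/weights"
print(f"cache_dir={cache_dir}, ")    # cache_dir=/tmp/weights,
print(f"cache_dir={cache_dir!r}, ")  # cache_dir='/tmp/weights',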
@@ -111,8 +109,7 @@ class Server:
             num_cpu_blocks=self.num_cpu_blocks,
             max_num_batched_tokens=max_num_batched_tokens,
             max_num_sequences=max_num_sequences,
-            collect_stats=collect_stats,
-            do_memory_analysis=do_memory_analysis,
+            log_stats=log_stats,
         )
         # Connect the controllers.
         for i in range(len(self.controllers) - 1):
@@ -244,6 +241,7 @@ def add_server_arguments(parser: argparse.ArgumentParser):
     parser.add_argument('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
     parser.add_argument('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens per iteration')
     parser.add_argument('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
+    parser.add_argument('--log-stats', action='store_true', help='log system statistics')
     return parser
 
 
@@ -286,6 +284,7 @@ def init_local_server_and_frontend_with_arguments(args: argparse.Namespace):
         gpu_memory=get_gpu_memory(),
         cpu_memory=get_cpu_memory(),
         use_ray=args.use_ray,
+        log_stats=args.log_stats,
     )
 
     # Create a frontend.
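End to end, the new option flows from the command line into the scheduler: argparse stores --log-stats as a boolean on args, the server forwards it as log_stats, and the scheduler either takes the early-return path or performs the periodic logging. A compressed, hypothetical sketch of that wiring (these Scheduler/Server stubs are stand-ins, not the real classes):

import argparse


class Scheduler:
    def __init__(self, log_stats: bool) -> None:
        self.log_stats = log_stats  # checked on every scheduling step


class Server:
    def __init__(self, log_stats: bool) -> None:
        self.scheduler = Scheduler(log_stats=log_stats)


parser = argparse.ArgumentParser()
# Mirrors the diff: store_true defaults the flag to False when omitted.
parser.add_argument('--log-stats', action='store_true', help='log system statistics')
args = parser.parse_args(['--log-stats'])

server = Server(log_stats=args.log_stats)
print(server.scheduler.log_stats)  # True; False when the flag is omitted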
@@ -91,7 +91,6 @@ class Worker:
         initialize_model_parallel(tensor_parallel_size,
                                   pipeline_parallel_size)
 
-
     def prepare_inputs(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],