def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None:
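    """Validate that the profiled KV cache can hold ``max_model_len`` tokens.

    The cache provides ``num_gpu_blocks`` GPU blocks of ``block_size`` tokens
    each; this count must be positive and large enough to store at least one
    sequence of length ``max_model_len``, otherwise a ``ValueError`` is raised.
    """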
    if num_gpu_blocks <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")
    max_seq_len = block_size * num_gpu_blocks
    if max_model_len > max_seq_len:
        raise ValueError(
            f"The model's max seq len ({max_model_len}) "
            "is larger than the maximum number of tokens that can be "
            f"stored in KV cache ({max_seq_len}). Try increasing "
            "`gpu_memory_utilization` or decreasing `max_model_len` when "
            "initializing the engine.")
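
# --- Illustrative usage (not part of the original file) ---------------------
# A minimal sketch of how a caller might validate the profiled KV-cache size
# before allocating it. The concrete block counts and lengths below are
# hypothetical example values, not defaults taken from vLLM.
if __name__ == "__main__":
    # 1024 blocks of 16 tokens each give 16384 tokens of KV-cache capacity,
    # which covers a max_model_len of 8192, so this call returns silently.
    check_block_size_valid(num_gpu_blocks=1024, block_size=16,
                           max_model_len=8192)

    # 256 blocks of 16 tokens give only 4096 tokens of capacity, which is
    # smaller than max_model_len=8192, so this call raises ValueError.
    try:
        check_block_size_valid(num_gpu_blocks=256, block_size=16,
                               max_model_len=8192)
    except ValueError as exc:
        print(f"Expected failure: {exc}")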